import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
Load the data and view its contents:
# Read the spreadsheet into a DataFrame.  skiprows=1 drops the first sheet
# row, so the row after it supplies the column names.
df = pd.read_excel(
    io="data/default-of-credit-card clients.xls",
    skiprows=1,
)
# Preview the first ten records (rendered only in an interactive session).
df.head(n=10)
ID | LIMIT_BAL | SEX | EDUCATION | MARRIAGE | AGE | PAY_0 | PAY_2 | PAY_3 | PAY_4 | ... | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 | default payment next month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 20000 | 2 | 2 | 1 | 24 | 2 | 2 | -1 | -1 | ... | 0 | 0 | 0 | 0 | 689 | 0 | 0 | 0 | 0 | 1 |
1 | 2 | 120000 | 2 | 2 | 2 | 26 | -1 | 2 | 0 | 0 | ... | 3272 | 3455 | 3261 | 0 | 1000 | 1000 | 1000 | 0 | 2000 | 1 |
2 | 3 | 90000 | 2 | 2 | 2 | 34 | 0 | 0 | 0 | 0 | ... | 14331 | 14948 | 15549 | 1518 | 1500 | 1000 | 1000 | 1000 | 5000 | 0 |
3 | 4 | 50000 | 2 | 2 | 1 | 37 | 0 | 0 | 0 | 0 | ... | 28314 | 28959 | 29547 | 2000 | 2019 | 1200 | 1100 | 1069 | 1000 | 0 |
4 | 5 | 50000 | 1 | 2 | 1 | 57 | -1 | 0 | -1 | 0 | ... | 20940 | 19146 | 19131 | 2000 | 36681 | 10000 | 9000 | 689 | 679 | 0 |
5 | 6 | 50000 | 1 | 1 | 2 | 37 | 0 | 0 | 0 | 0 | ... | 19394 | 19619 | 20024 | 2500 | 1815 | 657 | 1000 | 1000 | 800 | 0 |
6 | 7 | 500000 | 1 | 1 | 2 | 29 | 0 | 0 | 0 | 0 | ... | 542653 | 483003 | 473944 | 55000 | 40000 | 38000 | 20239 | 13750 | 13770 | 0 |
7 | 8 | 100000 | 2 | 2 | 2 | 23 | 0 | -1 | -1 | 0 | ... | 221 | -159 | 567 | 380 | 601 | 0 | 581 | 1687 | 1542 | 0 |
8 | 9 | 140000 | 2 | 3 | 1 | 28 | 0 | 0 | 2 | 0 | ... | 12211 | 11793 | 3719 | 3329 | 0 | 432 | 1000 | 1000 | 1000 | 0 |
9 | 10 | 20000 | 1 | 3 | 2 | 35 | -2 | -2 | -2 | -2 | ... | 0 | 13007 | 13912 | 0 | 0 | 0 | 13007 | 1122 | 0 | 0 |
10 rows × 25 columns
Split the data set into 80% training and 20% testing portions:
# Every column except the last is a feature; the last column is the target.
# NOTE(review): this keeps the leading "ID" column as a feature -- it looks
# like a row identifier rather than a predictor; confirm that is intended.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing.  Stratifying on y keeps the class
# proportions the same in both portions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

# Wrap both portions in XGBoost's DMatrix container, labels attached.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
Train the model using a binary classification algorithm:
# Booster configuration.
# - The training labels are already attached to `dtrain`, so no "label"
#   entry belongs here (it is not an XGBoost parameter and was ignored).
# - "max_depth" was dropped: it is a tree-booster setting and has no effect
#   with the linear booster selected below.
# - "verbosity": 0 replaces the deprecated "silent": 1 (suppress messages).
param = {
    "booster": "gblinear",
    "objective": "binary:logistic",  # outputs P(default) for each row
    "eta": 1,
    "verbosity": 0,
}

# Number of boosting rounds.
num_round = 20

# Fit the model on the training portion.
bst = xgb.train(param, dtrain, num_round)
Serialize the trained model to a file for later use:
# Persist the trained booster to disk so it can be reloaded later.
bst.save_model(fname="xgb.model")
Generate model predictions on the test data and calculate the macro-averaged precision of the model:
# With the binary:logistic objective, predict() returns a 1-D array holding
# one probability of the positive class (default) per test row.
preds = bst.predict(dtest)

# Turn probabilities into hard 0/1 labels with a 0.5 cut-off.
# The earlier per-row np.argmax() was applied to scalars and therefore always
# returned 0, labelling every account as non-default; any score recorded from
# that version reflects all-zero predictions.
best_preds = (preds > 0.5).astype(int)

# Macro-averaged precision: unweighted mean of the per-class precisions.
# zero_division=0 scores a class as 0 if it is never predicted.
print(precision_score(y_test, best_preds, average='macro', zero_division=0))
0.3894166666666667
Let's generate a prediction for an account that we know has a good payment history:
# One account as a single-row, 24-column matrix.  Values are listed in the
# same order as the feature columns, df.columns[:-1].
input_data = np.array([[
    35, 500000, 1, 1, 1, 58,
    -2, -2, -2, -2, -2, -2,
    13709, 5006, 31130, 3180, 0, 5293,
    5006, 31178, 3180, 0, 5293, 768,
]])

# Attach the training column names so the booster sees matching features.
input_df = pd.DataFrame(data=input_data, columns=df.columns[:-1])
dinput = xgb.DMatrix(data=input_df)

# Predicted probability of default for this account.
bst.predict(dinput)
array([0.04445041], dtype=float32)
Let's generate a prediction for an account that we know defaults on their payment:
# One account as a single-row, 24-column matrix.  Values are listed in the
# same order as the feature columns, df.columns[:-1].
input_data = np.array([[
    1, 20000, 2, 2, 1, 24,
    2, 2, -1, -1, -2, -2,
    3913, 3102, 689, 0, 0, 0,
    0, 689, 0, 0, 0, 0,
]])

# Attach the training column names so the booster sees matching features.
input_df = pd.DataFrame(data=input_data, columns=df.columns[:-1])
dinput = xgb.DMatrix(data=input_df)

# Predicted probability of default for this account.
bst.predict(dinput)
array([0.48290196], dtype=float32)