Xgboost for Health Data (Pipeline Step 3)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
df = pd.read_csv(url, error_bad_lines=False)
df
df = df.drop(columns = ['id'])
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
target = df.stroke.values
features = df.drop(columns =["stroke"])
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)
df.dtypes
# Label Encoding
for f in df.columns:
if X_train[f].dtype=='object' or X_test[f].dtype=='object':
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train[f].values) + list(X_test[f].values))
X_train[f] = lbl.transform(list(X_train[f].values))
X_test[f] = lbl.transform(list(X_test[f].values))
for f in df.columns:
print(f)
# Label Encoding
for f in df.columns:
if df[f].dtype=='object':
lbl = preprocessing.LabelEncoder()
lbl.fit(list(df[f].values))
df[f] = lbl.transform(list(df[f].values))
df
# Label Encoding
for f in X_train.columns:
if X_train[f].dtype=='object' or X_test[f].dtype=='object':
lbl = preprocessing.LabelEncoder()
lbl.fit(list(X_train[f].values) + list(X_test[f].values))
X_train[f] = lbl.transform(list(X_train[f].values))
X_test[f] = lbl.transform(list(X_test[f].values))
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
import xgboost as xgb
clf = xgb.XGBClassifier(
n_estimators=500,
max_depth=9,
learning_rate=0.05,
subsample=0.9,
colsample_bytree=0.9,
missing=-999,
random_state=2021,
tree_method='auto'
# tree_method='hist'
# tree_method='gpu_hist'
)
## K Fold Cross Validation (5 Folds)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# Define Parameter grid for hyperparameter search process
param_grid = {
'colsample_bytree':[.75,1],
'learning_rate':[0.01,0.05,0.1,0.3,0.5],
'max_depth':[1,2,3,5],
'subsample':[.75,1],
'n_estimators': list(range(50, 400, 50))
}
## Run the GridSearch,optimizing on ROC
grid_search = GridSearchCV(estimator=clf, scoring='roc_auc', param_grid=param_grid, n_jobs=-1, cv=kfold)
%%time
grid_result = grid_search.fit(X_train, y_train)
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')
#Set our final hyperparameters to the tuned values
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
colsample_bytree = grid_result.best_params_['colsample_bytree'],
learning_rate = grid_result.best_params_['learning_rate'],
max_depth = grid_result.best_params_['max_depth'],
subsample = grid_result.best_params_['subsample'],
n_estimators = grid_result.best_params_['n_estimators'])
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1,
scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])
print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
import sklearn.metrics as metrics
#Fit the final model
xgbcl.fit(X_train, y_train)
#Generate predictions against our training and test data
pred_train = xgbcl.predict(X_train)
proba_train = xgbcl.predict_proba(X_train)
pred_test = xgbcl.predict(X_test)
proba_test = xgbcl.predict_proba(X_test)
# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))
print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))
print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))
# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])
train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# plot feature importance
xgb.plot_importance(xgbcl, importance_type='gain');
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
X = features
#Set our final hyperparameters to the tuned values
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
colsample_bytree = grid_result.best_params_['colsample_bytree'],
learning_rate = grid_result.best_params_['learning_rate'],
max_depth = grid_result.best_params_['max_depth'],
subsample = grid_result.best_params_['subsample'],
n_estimators = grid_result.best_params_['n_estimators'])
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1,
scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])
print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
#Generate predictions against our training and test data
pred_train = clf.predict(X_train)
proba_train = clf.predict_proba(X_train)
pred_test = clf.predict(X_test)
proba_test = clf.predict_proba(X_test)
import sklearn.metrics as metrics
# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))
print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))
print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))
# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])
train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# plot feature importance
xgb.plot_importance(clf, importance_type='gain');
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
import shap
## kernel shap sends data as numpy array which has no column names, so we fix it
def xgb_predict(data_asarray):
data_asframe = pd.DataFrame(data_asarray, columns=feature_names)
return estimator.predict(data_asframe)
#### Kernel SHAP
X_summary = shap.kmeans(X_train, 10)
#shap_kernel_explainer = shap.KernelExplainer(xgbcl, X_summary)
def xgb_predict(df):
data_asframe = pd.DataFrame(df, columns=feature_names)
return estimator.predict(data_asframe)
#### Tree SHAP
shap_tree_explainer = shap.TreeExplainer(xgbcl, feature_perturbation = "interventional")
shap.initjs()
## shapely values with kernel SHAP
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[5]])
shap.initjs()
## shapely values with Tree SHAP
shap_values_single = shap_tree_explainer.shap_values(X_test.iloc[[10]])
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[5]])
shap_values = shap_tree_explainer.shap_values(X_train)
shap.summary_plot(shap_values, features)
# #Display all features and SHAP values
# display(pd.DataFrame(data=shap_values, columns=X_train.columns, index=[126]).transpose().sort_values(by=126, ascending=True))
X_train.columns
shap.dependence_plot('age', shap_values, X_train, interaction_index='bmi')
shap.dependence_plot('bmi', shap_values, X_train) #when we don't specify an interaction_index, the strongest one is automatically chosen for us
shap.dependence_plot('heart_disease', shap_values, X_train, interaction_index='age')