import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sqlalchemy as db
import sqlite3
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# error_bad_lines is deprecated in newer pandas; on_bad_lines='skip' is the replacement
df = pd.read_csv(url, error_bad_lines=False)
df
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 30669 Male 3.0 0 0 No children Rural 95.12 18.0 NaN 0
1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
2 16523 Female 8.0 0 0 No Private Urban 110.89 17.6 NaN 0
3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
4 46136 Male 14.0 0 0 No Never_worked Rural 161.28 19.1 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ...
43395 56196 Female 10.0 0 0 No children Urban 58.64 20.4 never smoked 0
43396 5450 Female 56.0 0 0 Yes Govt_job Urban 213.61 55.4 formerly smoked 0
43397 28375 Female 82.0 1 0 Yes Private Urban 91.94 28.9 formerly smoked 0
43398 27973 Male 40.0 0 0 Yes Private Urban 99.16 33.2 never smoked 0
43399 36271 Female 82.0 0 0 Yes Private Urban 79.48 20.6 never smoked 0

43400 rows × 12 columns
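# The preview above shows NaN in smoking_status; an optional quick check of how
# many values are missing in each column before modelling:
df.isna().sum()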

df = df.drop(columns = ['id'])
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
target = df.stroke.values
features = df.drop(columns =["stroke"])
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)
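# The target is very imbalanced (strokes are rare), which matters for the metrics
# later on; a quick check of the positive rate in each split. An alternative
# (not used here) is to pass stratify=target to train_test_split so both splits
# keep the same stroke rate.
print('Stroke rate (train):', y_train.mean())
print('Stroke rate (test): ', y_test.mean())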
df.dtypes
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
# Label Encoding (first attempt): iterating over df.columns includes 'stroke',
# which is not a column of X_train/X_test, so this loop fails with a KeyError
for f in df.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   
# Inspect the column names: 'stroke' is in df but not in the feature matrices
for f in df.columns:
    print(f)
gender
age
hypertension
heart_disease
ever_married
work_type
Residence_type
avg_glucose_level
bmi
smoking_status
stroke
# Label-encode the object columns of the full DataFrame (the model itself uses X_train/X_test below)
for f in df.columns:
    if df[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))
df
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 3.0 0 0 0 4 0 95.12 18.0 1 0
1 1 58.0 1 0 1 2 1 87.96 39.2 2 0
2 0 8.0 0 0 0 2 1 110.89 17.6 1 0
3 0 70.0 0 0 1 2 0 69.04 35.9 0 0
4 1 14.0 0 0 0 1 0 161.28 19.1 1 0
... ... ... ... ... ... ... ... ... ... ... ...
43395 0 10.0 0 0 0 4 1 58.64 20.4 2 0
43396 0 56.0 0 0 1 0 1 213.61 55.4 0 0
43397 0 82.0 1 0 1 2 1 91.94 28.9 0 0
43398 1 40.0 0 0 1 2 1 99.16 33.2 2 0
43399 0 82.0 0 0 1 2 1 79.48 20.6 2 0

43400 rows × 11 columns

# Label Encoding, corrected: iterate over the feature columns only
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   
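# The LabelEncoder above is fit on train and test combined; a minimal alternative
# sketch (not used in this notebook, assumes scikit-learn >= 0.24) fits an
# OrdinalEncoder on the training features only:
# from sklearn.preprocessing import OrdinalEncoder
# cat_cols = features.select_dtypes(include='object').columns
# enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# X_train[cat_cols] = enc.fit_transform(X_train[cat_cols])
# X_test[cat_cols] = enc.transform(X_test[cat_cols])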
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
import xgboost as xgb
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2021,
    tree_method='auto'
#    tree_method='hist'
#    tree_method='gpu_hist'
)
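# Note: fitting this classifier emits deprecation warnings (shown further down);
# with xgboost >= 1.3 they can be silenced by also passing, for example:
# clf = xgb.XGBClassifier(
#     n_estimators=500, max_depth=9, learning_rate=0.05, subsample=0.9,
#     colsample_bytree=0.9, missing=-999, random_state=2021, tree_method='auto',
#     use_label_encoder=False, eval_metric='logloss')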
## Stratified k-fold cross validation (n_splits=2 here, to keep the grid search fast)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# Define the parameter grid for the hyperparameter search
param_grid = { 
    'colsample_bytree':[.75,1],
    'learning_rate':[0.01,0.05,0.1,0.3,0.5],
    'max_depth':[1,2,3,5],
    'subsample':[.75,1],
    'n_estimators': list(range(50, 400, 50))
}
## Run the grid search, optimizing on ROC AUC
grid_search = GridSearchCV(estimator=clf, scoring='roc_auc', param_grid=param_grid, n_jobs=-1, cv=kfold)
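# The grid above has 2 * 5 * 4 * 2 * 7 = 560 parameter combinations, each fit on
# 2 folds (1,120 fits in total), which is why the timed cell below takes roughly
# 20 minutes of wall time.
n_candidates = int(np.prod([len(v) for v in param_grid.values()]))
print(n_candidates, 'candidates x', kfold.get_n_splits(), 'folds =',
      n_candidates * kfold.get_n_splits(), 'fits')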
%%time
grid_result = grid_search.fit(X_train, y_train)
/home/david/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[09:05:39] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
CPU times: user 27 s, sys: 996 ms, total: 28 s
Wall time: 20min 15s
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')
Best: 0.8617766137590029 using {'colsample_bytree': 0.75, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.75} 

#Set our final hyperparameters to the tuned values
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
         colsample_bytree = grid_result.best_params_['colsample_bytree'], 
         learning_rate = grid_result.best_params_['learning_rate'], 
         max_depth = grid_result.best_params_['max_depth'], 
         subsample = grid_result.best_params_['subsample'], 
         n_estimators = grid_result.best_params_['n_estimators'])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1, 
                        scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])

print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
Training 5-fold Cross Validation Results:

AUC:  0.8632093427558646
Accuracy:  0.981278801843318
Precision:  0.0
Recall:  0.0
F1:  0.0 
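# Precision, recall and F1 of 0.0 are a symptom of class imbalance: positives are
# very rare (133 of the 8,680 test rows below), so the tuned model reaches high
# accuracy and AUC while never predicting the positive class. A minimal sketch of
# one common counter-measure (not used in this notebook) is to up-weight the
# positive class via scale_pos_weight:
# pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
# xgbcl_weighted = xgb.XGBClassifier(scale_pos_weight=pos_weight,
#                                    random_state=42,
#                                    **grid_result.best_params_)
# xgbcl_weighted.fit(X_train, y_train)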

import sklearn.metrics as metrics
#Fit the final model
xgbcl.fit(X_train, y_train)

#Generate predictions against our training and test data
pred_train = xgbcl.predict(X_train)
proba_train = xgbcl.predict_proba(X_train)
pred_test = xgbcl.predict(X_test)
proba_test = xgbcl.predict_proba(X_test)

# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))

print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))

print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))

# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])

train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
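# Because positives are so rare, a precision-recall curve is a useful complement
# to the ROC curve above; a minimal sketch on the test set:
prec, rec, _ = metrics.precision_recall_curve(y_test, proba_test[:, 1])
plt.figure(figsize=[7, 5])
plt.plot(rec, prec)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Test)')
plt.show()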

# plot feature importance
xgb.plot_importance(xgbcl, importance_type='gain');
[20:30:35] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Classification report (Test): 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8547
           1       0.00      0.00      0.00       133

    accuracy                           0.98      8680
   macro avg       0.49      0.50      0.50      8680
weighted avg       0.97      0.98      0.98      8680

Confusion matrix (Test): 

[[0.98467742 0.        ]
 [0.01532258 0.        ]]

Train Accuracy: 0.981278801843318
Test Accuracy: 0.9846774193548387

Train AUC: 0.8907315481700571
Test AUC: 0.8661175578468812
/home/david/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
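# The UndefinedMetricWarning above is raised because the model never predicts the
# positive class; as the message suggests, passing zero_division makes the zeros
# explicit instead of warning:
print(metrics.classification_report(y_test, pred_test, zero_division=0))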
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
33658 0 60.0 0 0 1 2 0 108.13 28.6 2 0
X = features
#Set our final hyperparameters to the tuned values (note: this cell fails below
# with a NameError because grid_result is not defined in this session)
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
         colsample_bytree = grid_result.best_params_['colsample_bytree'], 
         learning_rate = grid_result.best_params_['learning_rate'], 
         max_depth = grid_result.best_params_['max_depth'], 
         subsample = grid_result.best_params_['subsample'], 
         n_estimators = grid_result.best_params_['n_estimators'])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1, 
                        scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])

print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-36f8657b8499> in <module>
      4          missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
      5          reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
----> 6          colsample_bytree = grid_result.best_params_['colsample_bytree'],
      7          learning_rate = grid_result.best_params_['learning_rate'],
      8          max_depth = grid_result.best_params_['max_depth'],

NameError: name 'grid_result' is not defined
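# grid_result only exists in the earlier session, hence the NameError. One way to
# make this cell runnable without repeating the 20-minute search is to hard-code
# the best parameters reported above (hypothetical variable names):
# best_params = {'colsample_bytree': 0.75, 'learning_rate': 0.01,
#                'max_depth': 5, 'n_estimators': 300, 'subsample': 0.75}
# xgbcl = xgb.XGBClassifier(objective='binary:logistic', random_state=42, **best_params)
# The cells below instead evaluate the un-tuned clf defined earlier.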
#Fit the un-tuned clf directly (the tuned xgbcl could not be rebuilt above)
clf.fit(X_train, y_train)
#Generate predictions against our training and test data
pred_train = clf.predict(X_train)
proba_train = clf.predict_proba(X_train)
pred_test = clf.predict(X_test)
proba_test = clf.predict_proba(X_test)
import sklearn.metrics as metrics
# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))

print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))

print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))
Classification report (Test): 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8547
           1       0.20      0.02      0.03       133

    accuracy                           0.98      8680
   macro avg       0.59      0.51      0.51      8680
weighted avg       0.97      0.98      0.98      8680

Confusion matrix (Test): 

[[9.83755760e-01 9.21658986e-04]
 [1.50921659e-02 2.30414747e-04]]

Train Accuracy: 0.9959389400921659
Test Accuracy: 0.9839861751152074

Train AUC: 0.9999601273396401
Test AUC: 0.835284948066903
# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])

train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# plot feature importance
xgb.plot_importance(clf, importance_type='gain');
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
33658 Female 60.0 0 0 Yes Private Rural 108.13 28.6 never smoked 0
import shap
## Kernel SHAP passes the data as a NumPy array with no column names, so wrap the
## fitted model in a function that rebuilds a DataFrame before predicting
feature_names = X_train.columns
def xgb_predict(data_asarray):
    data_asframe = pd.DataFrame(data_asarray, columns=feature_names)
    return xgbcl.predict(data_asframe)
#### Kernel SHAP
X_summary = shap.kmeans(X_train, 10)
#shap_kernel_explainer = shap.KernelExplainer(xgb_predict, X_summary)
#### Tree SHAP
shap_tree_explainer = shap.TreeExplainer(xgbcl, feature_perturbation = "interventional")
shap.initjs()
## Shapley values with Tree SHAP for a single test row
shap_values_single = shap_tree_explainer.shap_values(X_test.iloc[[5]])
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[5]])
shap.initjs()
## Shapley values with Tree SHAP for another test row
shap_values_single = shap_tree_explainer.shap_values(X_test.iloc[[10]])
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[10]])
shap_values = shap_tree_explainer.shap_values(X_train)
shap.summary_plot(shap_values, features)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-71-0722fac4ad3c> in <module>
      1 shap_values = shap_tree_explainer.shap_values(X_train)
----> 2 shap.summary_plot(shap_values, features)

~/anaconda3/lib/python3.8/site-packages/shap/plots/_beeswarm.py in summary_legacy(shap_values, features, feature_names, max_display, plot_type, color, axis_color, title, alpha, show, sort, color_bar, plot_size, layered_violin_max_num_bins, class_names, class_inds, color_bar_label, cmap, auto_size_plot, use_log_scale)
    635                     vmin = vmax
    636 
--> 637                 assert features.shape[0] == len(shaps), "Feature and SHAP matrices must have the same number of rows!"
    638 
    639                 # plot the nan values in the interaction feature as grey

AssertionError: Feature and SHAP matrices must have the same number of rows!
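# The assertion fails because shap_values was computed on X_train (34,720 rows)
# while `features` still has all 43,400 rows; plotting against the same matrix
# the SHAP values were computed on works:
shap.summary_plot(shap_values, X_train)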
# #Display all features and SHAP values
# display(pd.DataFrame(data=shap_values, columns=X_train.columns, index=[126]).transpose().sort_values(by=126, ascending=True))
X_train.columns
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')
shap.dependence_plot('age', shap_values, X_train, interaction_index='bmi')
shap.dependence_plot('bmi', shap_values, X_train)  # when no interaction_index is given, the strongest interaction is chosen automatically
shap.dependence_plot('heart_disease', shap_values, X_train, interaction_index='age')