import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sqlalchemy as db
import sqlite3
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# error_bad_lines is deprecated in newer pandas; on_bad_lines='skip' is the replacement
df = pd.read_csv(url, error_bad_lines=False)
df
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 30669 Male 3.0 0 0 No children Rural 95.12 18.0 NaN 0
1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
2 16523 Female 8.0 0 0 No Private Urban 110.89 17.6 NaN 0
3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
4 46136 Male 14.0 0 0 No Never_worked Rural 161.28 19.1 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ...
43395 56196 Female 10.0 0 0 No children Urban 58.64 20.4 never smoked 0
43396 5450 Female 56.0 0 0 Yes Govt_job Urban 213.61 55.4 formerly smoked 0
43397 28375 Female 82.0 1 0 Yes Private Urban 91.94 28.9 formerly smoked 0
43398 27973 Male 40.0 0 0 Yes Private Urban 99.16 33.2 never smoked 0
43399 36271 Female 82.0 0 0 Yes Private Urban 79.48 20.6 never smoked 0

43400 rows × 12 columns
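# The preview above shows NaN in smoking_status; an optional quick check of how
# many values are missing in each column before modelling:
df.isna().sum()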

df = df.drop(columns = ['id'])
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
target = df.stroke.values
features = df.drop(columns =["stroke"])
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)
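# The target is very imbalanced (strokes are rare), which matters for the metrics
# later on; a quick check of the positive rate in each split. An alternative
# (not used here) is to pass stratify=target to train_test_split so both splits
# keep the same stroke rate.
print('Stroke rate (train):', y_train.mean())
print('Stroke rate (test): ', y_test.mean())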
df.dtypes
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
# Label Encoding (first attempt): iterating over df.columns includes 'stroke',
# which is not a column of X_train/X_test, so this loop fails with a KeyError
for f in df.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   
# Inspect the column names: 'stroke' is in df but not in the feature matrices
for f in df.columns:
    print(f)
gender
age
hypertension
heart_disease
ever_married
work_type
Residence_type
avg_glucose_level
bmi
smoking_status
stroke
# Label-encode the object columns of the full DataFrame (the model itself uses X_train/X_test below)
for f in df.columns:
    if df[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))
df
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 3.0 0 0 0 4 0 95.12 18.0 1 0
1 1 58.0 1 0 1 2 1 87.96 39.2 2 0
2 0 8.0 0 0 0 2 1 110.89 17.6 1 0
3 0 70.0 0 0 1 2 0 69.04 35.9 0 0
4 1 14.0 0 0 0 1 0 161.28 19.1 1 0
... ... ... ... ... ... ... ... ... ... ... ...
43395 0 10.0 0 0 0 4 1 58.64 20.4 2 0
43396 0 56.0 0 0 1 0 1 213.61 55.4 0 0
43397 0 82.0 1 0 1 2 1 91.94 28.9 0 0
43398 1 40.0 0 0 1 2 1 99.16 33.2 2 0
43399 0 82.0 0 0 1 2 1 79.48 20.6 2 0

43400 rows × 11 columns

# Label Encoding, corrected: iterate over the feature columns only
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   
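# The LabelEncoder above is fit on train and test combined; a minimal alternative
# sketch (not used in this notebook, assumes scikit-learn >= 0.24) fits an
# OrdinalEncoder on the training features only:
# from sklearn.preprocessing import OrdinalEncoder
# cat_cols = features.select_dtypes(include='object').columns
# enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# X_train[cat_cols] = enc.fit_transform(X_train[cat_cols])
# X_test[cat_cols] = enc.transform(X_test[cat_cols])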
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
import xgboost as xgb
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2021,
    tree_method='auto'
#    tree_method='hist'
#    tree_method='gpu_hist'
)
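# Note: fitting this classifier emits deprecation warnings (shown further down);
# with xgboost >= 1.3 they can be silenced by also passing, for example:
# clf = xgb.XGBClassifier(
#     n_estimators=500, max_depth=9, learning_rate=0.05, subsample=0.9,
#     colsample_bytree=0.9, missing=-999, random_state=2021, tree_method='auto',
#     use_label_encoder=False, eval_metric='logloss')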
## Stratified k-fold cross validation (n_splits=2 here, to keep the grid search fast)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# Define the parameter grid for the hyperparameter search
param_grid = { 
    'colsample_bytree':[.75,1],
    'learning_rate':[0.01,0.05,0.1,0.3,0.5],
    'max_depth':[1,2,3,5],
    'subsample':[.75,1],
    'n_estimators': list(range(50, 400, 50))
}
## Run the grid search, optimizing on ROC AUC
grid_search = GridSearchCV(estimator=clf, scoring='roc_auc', param_grid=param_grid, n_jobs=-1, cv=kfold)
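# The grid above has 2 * 5 * 4 * 2 * 7 = 560 parameter combinations, each fit on
# 2 folds (1,120 fits in total), which is why the timed cell below takes roughly
# 20 minutes of wall time.
n_candidates = int(np.prod([len(v) for v in param_grid.values()]))
print(n_candidates, 'candidates x', kfold.get_n_splits(), 'folds =',
      n_candidates * kfold.get_n_splits(), 'fits')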
%%time
grid_result = grid_search.fit(X_train, y_train)
/home/david/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[09:05:39] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
CPU times: user 27 s, sys: 996 ms, total: 28 s
Wall time: 20min 15s
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')
Best: 0.8617766137590029 using {'colsample_bytree': 0.75, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.75} 

#Set our final hyperparameters to the tuned values
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
         colsample_bytree = grid_result.best_params_['colsample_bytree'], 
         learning_rate = grid_result.best_params_['learning_rate'], 
         max_depth = grid_result.best_params_['max_depth'], 
         subsample = grid_result.best_params_['subsample'], 
         n_estimators = grid_result.best_params_['n_estimators'])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1, 
                        scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])

print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
Training 5-fold Cross Validation Results:

AUC:  0.8632093427558646
Accuracy:  0.981278801843318
Precision:  0.0
Recall:  0.0
F1:  0.0 
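# Precision, recall and F1 of 0.0 are a symptom of class imbalance: positives are
# very rare (133 of the 8,680 test rows below), so the tuned model reaches high
# accuracy and AUC while never predicting the positive class. A minimal sketch of
# one common counter-measure (not used in this notebook) is to up-weight the
# positive class via scale_pos_weight:
# pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
# xgbcl_weighted = xgb.XGBClassifier(scale_pos_weight=pos_weight,
#                                    random_state=42,
#                                    **grid_result.best_params_)
# xgbcl_weighted.fit(X_train, y_train)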

import sklearn.metrics as metrics
#Fit the final model
xgbcl.fit(X_train, y_train)

#Generate predictions against our training and test data
pred_train = xgbcl.predict(X_train)
proba_train = xgbcl.predict_proba(X_train)
pred_test = xgbcl.predict(X_test)
proba_test = xgbcl.predict_proba(X_test)

# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))

print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))

print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))

# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])

train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
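# Because positives are so rare, a precision-recall curve is a useful complement
# to the ROC curve above; a minimal sketch on the test set:
prec, rec, _ = metrics.precision_recall_curve(y_test, proba_test[:, 1])
plt.figure(figsize=[7, 5])
plt.plot(rec, prec)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve (Test)')
plt.show()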

# plot feature importance
xgb.plot_importance(xgbcl, importance_type='gain');
[20:30:35] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Classification report (Test): 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8547
           1       0.00      0.00      0.00       133

    accuracy                           0.98      8680
   macro avg       0.49      0.50      0.50      8680
weighted avg       0.97      0.98      0.98      8680

Confusion matrix (Test): 

[[0.98467742 0.        ]
 [0.01532258 0.        ]]

Train Accuracy: 0.981278801843318
Test Accuracy: 0.9846774193548387

Train AUC: 0.8907315481700571
Test AUC: 0.8661175578468812
/home/david/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
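# The UndefinedMetricWarning above is raised because the model never predicts the
# positive class; as the message suggests, passing zero_division makes the zeros
# explicit instead of warning:
print(metrics.classification_report(y_test, pred_test, zero_division=0))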
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
33658 0 60.0 0 0 1 2 0 108.13 28.6 2 0
X = features
#Set our final hyperparameters to the tuned values (note: this cell fails below
# with a NameError because grid_result is not defined in this session)
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
         gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
         missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
         reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
         colsample_bytree = grid_result.best_params_['colsample_bytree'], 
         learning_rate = grid_result.best_params_['learning_rate'], 
         max_depth = grid_result.best_params_['max_depth'], 
         subsample = grid_result.best_params_['subsample'], 
         n_estimators = grid_result.best_params_['n_estimators'])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#refit the model on k-folds to get stable avg error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1, 
                        scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])

print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-36f8657b8499> in <module>
      4          missing=None, n_jobs=-1, objective='binary:logistic', random_state=42, reg_alpha=0.0,
      5          reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
----> 6          colsample_bytree = grid_result.best_params_['colsample_bytree'],
      7          learning_rate = grid_result.best_params_['learning_rate'],
      8          max_depth = grid_result.best_params_['max_depth'],

NameError: name 'grid_result' is not defined
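# grid_result only exists in the earlier session, hence the NameError. One way to
# make this cell runnable without repeating the 20-minute search is to hard-code
# the best parameters reported above (hypothetical variable names):
# best_params = {'colsample_bytree': 0.75, 'learning_rate': 0.01,
#                'max_depth': 5, 'n_estimators': 300, 'subsample': 0.75}
# xgbcl = xgb.XGBClassifier(objective='binary:logistic', random_state=42, **best_params)
# The cells below instead evaluate the un-tuned clf defined earlier.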
#Fit the un-tuned clf directly (the tuned xgbcl could not be rebuilt above)
clf.fit(X_train, y_train)
#Generate predictions against our training and test data
pred_train = clf.predict(X_train)
proba_train = clf.predict_proba(X_train)
pred_test = clf.predict(X_test)
proba_test = clf.predict_proba(X_test)
import sklearn.metrics as metrics
# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))

print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))

print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))
Classification report (Test): 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8547
           1       0.20      0.02      0.03       133

    accuracy                           0.98      8680
   macro avg       0.59      0.51      0.51      8680
weighted avg       0.97      0.98      0.98      8680

Confusion matrix (Test): 

[[9.83755760e-01 9.21658986e-04]
 [1.50921659e-02 2.30414747e-04]]

Train Accuracy: 0.9959389400921659
Test Accuracy: 0.9839861751152074

Train AUC: 0.9999601273396401
Test AUC: 0.835284948066903
# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])

train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# plot feature importance
xgb.plot_importance(clf, importance_type='gain');
#take a random row of data
X_rand = features.sample(1, random_state = 5)
display(df.iloc[X_rand.index])
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
33658 Female 60.0 0 0 Yes Private Rural 108.13 28.6 never smoked 0
import shap
## Kernel SHAP passes the data as a NumPy array with no column names, so wrap the
## fitted model in a function that rebuilds a DataFrame before predicting
feature_names = X_train.columns
def xgb_predict(data_asarray):
    data_asframe = pd.DataFrame(data_asarray, columns=feature_names)
    return xgbcl.predict(data_asframe)
#### Kernel SHAP
X_summary = shap.kmeans(X_train, 10)
#shap_kernel_explainer = shap.KernelExplainer(xgb_predict, X_summary)
#### Tree SHAP
shap_tree_explainer = shap.TreeExplainer(xgbcl, feature_perturbation = "interventional")
shap.initjs()
## Shapley values with Tree SHAP for a single test row
shap_values_single = shap_tree_explainer.shap_values(X_test.iloc[[5]])
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[5]])
shap.initjs()
## Shapley values with Tree SHAP for another test row
shap_values_single = shap_tree_explainer.shap_values(X_test.iloc[[10]])
shap.force_plot(shap_tree_explainer.expected_value, shap_values_single, X_test.iloc[[10]])
shap_values = shap_tree_explainer.shap_values(X_train)
shap.summary_plot(shap_values, features)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-71-0722fac4ad3c> in <module>
      1 shap_values = shap_tree_explainer.shap_values(X_train)
----> 2 shap.summary_plot(shap_values, features)

~/anaconda3/lib/python3.8/site-packages/shap/plots/_beeswarm.py in summary_legacy(shap_values, features, feature_names, max_display, plot_type, color, axis_color, title, alpha, show, sort, color_bar, plot_size, layered_violin_max_num_bins, class_names, class_inds, color_bar_label, cmap, auto_size_plot, use_log_scale)
    635                     vmin = vmax
    636 
--> 637                 assert features.shape[0] == len(shaps), "Feature and SHAP matrices must have the same number of rows!"
    638 
    639                 # plot the nan values in the interaction feature as grey

AssertionError: Feature and SHAP matrices must have the same number of rows!
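# The assertion fails because shap_values was computed on X_train (34,720 rows)
# while `features` still has all 43,400 rows; plotting against the same matrix
# the SHAP values were computed on works:
shap.summary_plot(shap_values, X_train)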
# #Display all features and SHAP values
# display(pd.DataFrame(data=shap_values, columns=X_train.columns, index=[126]).transpose().sort_values(by=126, ascending=True))
X_train.columns
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')
shap.dependence_plot('age', shap_values, X_train, interaction_index='bmi')
shap.dependence_plot('bmi', shap_values, X_train)  # when no interaction_index is given, the strongest interaction is chosen automatically
shap.dependence_plot('heart_disease', shap_values, X_train, interaction_index='age')