# Modeling Health Care Data App
import os
import sys
import pickle
import warnings
from datetime import datetime, date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

from sklearn import datasets, linear_model, metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (train_test_split, cross_val_score, cross_val_predict,
                                     cross_validate, GridSearchCV)
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier

%matplotlib inline
plt.style.use("fivethirtyeight")
sns.set_context("notebook")
# Import the stroke training dataset
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines=False was removed in pandas 2.0
df.info()
## Data Prep
df = df.drop(columns = ['id'])
# Label Encoding: convert object (string) columns to integer codes
for f in df.columns:
    if df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))
# Report the percentage of missing values per column
pct_list = []
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    if round(pct_missing * 100) > 0:
        pct_list.append([col, round(pct_missing * 100)])
        print('{} - {}%'.format(col, round(pct_missing * 100)))
# Impute remaining missing values with column means, then drop any rows that still contain NaNs
df = df.fillna(df.mean())
df = df.dropna()
df.info()
Features = ['age','heart_disease','ever_married']
x = df[Features]
y = df["stroke"]
# Train Test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
#### Data Preprocessing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
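# Note: the scaler is fit on the training split only and then applied to the test split,
# which avoids leaking test-set statistics into training.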
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
import xgboost as xgb
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    missing=-999,
    random_state=2021,
    tree_method='auto'
    # tree_method='hist'
    # tree_method='gpu_hist'
)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
param_grid = {
    'colsample_bytree': [.75, 1],
    'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
    'max_depth': [1, 2, 3, 5],
    'subsample': [.75, 1],
    'n_estimators': list(range(50, 400, 50))
}
grid_search = GridSearchCV(estimator=clf, scoring='roc_auc', param_grid=param_grid, n_jobs=-1, cv=kfold)
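# Note: this grid has 2 x 5 x 4 x 2 x 7 = 560 parameter combinations, so with 2-fold CV
# the search fits 1,120 models (plus the final refit on the full training split).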
%%time
grid_result = grid_search.fit(X_train, y_train)
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}','\n')
# Set our final hyperparameters to the tuned values
xgbcl = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
                          gamma=0.0, max_delta_step=0.0, min_child_weight=1.0,
                          missing=np.nan, n_jobs=-1, objective='binary:logistic', random_state=42,
                          reg_alpha=0.0, reg_lambda=1.0, scale_pos_weight=1.0, tree_method='auto',
                          colsample_bytree=grid_result.best_params_['colsample_bytree'],
                          learning_rate=grid_result.best_params_['learning_rate'],
                          max_depth=grid_result.best_params_['max_depth'],
                          subsample=grid_result.best_params_['subsample'],
                          n_estimators=grid_result.best_params_['n_estimators'])
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Refit the model on k folds to get stable average error metrics
scores = cross_validate(estimator=xgbcl, X=X_train, y=y_train, cv=kfold, n_jobs=-1,
                        scoring=['accuracy', 'roc_auc', 'precision', 'recall', 'f1'])
print('Training 5-fold Cross Validation Results:\n')
print('AUC: ', scores['test_roc_auc'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('Precision: ', scores['test_precision'].mean())
print('Recall: ', scores['test_recall'].mean())
print('F1: ', scores['test_f1'].mean(), '\n')
import sklearn.metrics as metrics
#Fit the final model
xgbcl.fit(X_train, y_train)
#Generate predictions against our training and test data
pred_train = xgbcl.predict(X_train)
proba_train = xgbcl.predict_proba(X_train)
pred_test = xgbcl.predict(X_test)
proba_test = xgbcl.predict_proba(X_test)
# Print model report
print("Classification report (Test): \n")
print(metrics.classification_report(y_test, pred_test))
print("Confusion matrix (Test): \n")
print(metrics.confusion_matrix(y_test, pred_test)/len(y_test))
print ('\nTrain Accuracy:', metrics.accuracy_score(y_train, pred_train))
print ('Test Accuracy:', metrics.accuracy_score(y_test, pred_test))
print ('\nTrain AUC:', metrics.roc_auc_score(y_train, proba_train[:,1]))
print ('Test AUC:', metrics.roc_auc_score(y_test, proba_test[:,1]))
# calculate the fpr and tpr for all thresholds of the classification
train_fpr, train_tpr, train_threshold = metrics.roc_curve(y_train, proba_train[:,1])
test_fpr, test_tpr, test_threshold = metrics.roc_curve(y_test, proba_test[:,1])
train_roc_auc = metrics.auc(train_fpr, train_tpr)
test_roc_auc = metrics.auc(test_fpr, test_tpr)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=[7,5])
plt.title('Receiver Operating Characteristic')
plt.plot(train_fpr, train_tpr, 'b', label = 'Train AUC = %0.2f' % train_roc_auc)
plt.plot(test_fpr, test_tpr, 'g', label = 'Test AUC = %0.2f' % test_roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# plot feature importance
xgb.plot_importance(xgbcl, importance_type='gain');
pickle.dump(xgbcl, open('stroke_xgboost_model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))
model = pickle.load(open('stroke_xgboost_model.pkl', 'rb'))
print(model)
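As a quick sanity check for the app (a sketch; the variable names and sample values below are illustrative), the pickled scaler and model can be reloaded and used to score a single record with the same three features used for training:
# Sketch: score one hypothetical patient with the saved artifacts
# (assumes the Features list above: age, heart_disease, ever_married; values are made up)
loaded_scaler = pickle.load(open('scaler.pkl', 'rb'))
loaded_model = pickle.load(open('stroke_xgboost_model.pkl', 'rb'))
sample = pd.DataFrame([[67, 1, 1]], columns=['age', 'heart_disease', 'ever_married'])
sample_scaled = loaded_scaler.transform(sample)
print(loaded_model.predict_proba(sample_scaled)[:, 1])  # predicted probability of stroke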
# Select feature and target variables:
X = df.drop(['stroke'], axis=1)
y = df[['stroke']]
#One-hot encode the data using pandas get_dummies
X = pd.get_dummies(X)
#rus = RandomUnderSampler(random_state=0, replacement=True)
#X_resampled, y_resampled = rus.fit_resample(X, y)
#print(np.vstack(np.unique([tuple(row) for row in X_resampled], axis=0)).shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
sm = SMOTE(random_state=1)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)  # fit_sample was renamed to fit_resample in recent imbalanced-learn releases
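A quick check (sketch) that the resampling balanced the classes:
print(pd.Series(np.ravel(y_train)).value_counts())        # original class balance
print(pd.Series(np.ravel(y_train_SMOTE)).value_counts())  # balance after SMOTE oversampling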
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
model = RandomForestClassifier(n_estimators=100, bootstrap=True,
                               max_features='sqrt', n_jobs=3, verbose=1, class_weight="balanced")
model.fit(X_train_SMOTE, y_train_SMOTE)
y_pred = model.predict(X_test)
from sklearn.metrics import roc_auc_score
# Calculate roc auc
roc_value = roc_auc_score(y_test, y_pred)
roc_value
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# Precision-recall curve based on the predicted probabilities
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
# Import numpy and matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Histogram of the predicted stroke probabilities
plt.hist(y_pred_proba.ravel())
# Add a title to the plot
plt.title('Predicted Probability of Stroke')
# Show the plot
plt.show()
len(y_pred_proba)
y_pred
# Get numerical feature importances
importances = list(model.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the features and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
plt.figure(1)
plt.title('Feature Importance')
x_values = list(range(len(importances)))
plt.barh(x_values, importances, align='center')
plt.yticks(x_values, list(X.columns))
plt.xlabel('Relative Importance')
plt.tight_layout()
import pandas as pd
feature_importances = pd.DataFrame(model.feature_importances_,
                                   index=X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
importances
from sklearn.metrics import confusion_matrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix
sns.set(font_scale=5.0)
conf_mat = confusion_matrix(y_test, y_pred)
cm_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
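# Each row of cm_normalized sums to 1, so the diagonal shows per-class recall
# (the fraction of each actual class that was predicted correctly).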
fig, ax = plt.subplots(figsize=(30,30), dpi = 100)
sns.heatmap(cm_normalized, annot=True, cmap="Blues")
sns.set(font_scale=1)
plt.ylabel('Actual')
plt.xlabel('Predicted')
#fig.savefig('cm_augmented.png', dpi=fig.dpi, transparent=True)
plt.show()
cm_normalized
fig, ax = plt.subplots()
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.5)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = True, labeltop=True)
# Baseline: random forest trained on the original (imbalanced) training data
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
model = RandomForestClassifier(n_estimators=100, bootstrap=True,
                               max_features='sqrt', n_jobs=3, verbose=1, class_weight="balanced")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Refit on the SMOTE-resampled training data and evaluate on the untouched test set
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
model = RandomForestClassifier(n_estimators=100, bootstrap=True,
                               max_features='sqrt', n_jobs=3, verbose=1, class_weight="balanced")
model.fit(X_train_SMOTE, y_train_SMOTE)
y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
# Predicted stroke probabilities for the test set
y_pred = model.predict_proba(X_test)[:, 1]
test_proba = pd.DataFrame({'predicted_probability': y_pred})
test_proba.info()
## Check whether y_test indexes are the same as X_test indexes
same_index = y_test.index == X_test.index
same_index.all()
## Get the actuals and predicted probabilities into the same pandas frame
table = pd.concat([y_test.reset_index(drop=True), test_proba.reset_index(drop=True)], axis=1)
table
table.stroke.value_counts()
table.info()
table.to_csv('../processed_csvs/healthcare_table.csv')
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
#cross validation predictions for test set
y_test_pred = cross_val_predict(forest_clf, X_test, y_test, cv=5)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))
print("Precision:",metrics.precision_score(y_test, y_test_pred))
print("Recall:",metrics.recall_score(y_test, y_test_pred))
#cross validation predictions for full dataset
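# Note: cross_val_predict returns out-of-fold predictions, so every row is scored by a model
# that did not see that row during fitting.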
y_pred = cross_val_predict(forest_clf, X, y, cv=5)
print("Accuracy:",metrics.accuracy_score(y, y_pred))
print("Precision:",metrics.precision_score(y, y_pred))
print("Recall:",metrics.recall_score(y, y_pred))
# Out-of-fold predicted probabilities for the full dataset (the 0/1 predictions above are class labels, not probabilities)
y_proba_full = cross_val_predict(forest_clf, X, np.ravel(y), cv=5, method='predict_proba')[:, 1]
test_proba = pd.DataFrame({'predicted_probability': y_proba_full})
test_proba.info()
## Check whether y indexes are the same as X indexes
same_index = y.index == X.index
same_index.all()
## get them into the same pandas frame
table = pd.concat([y.reset_index(drop=True), test_proba.reset_index(drop=True)], axis=1)
table
table.stroke.value_counts()
table.to_csv('../processed_csvs/final_model_table.csv')
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score
models = [
    LogisticRegression(solver="liblinear", random_state=42),
    RandomForestClassifier(n_estimators=10, random_state=42),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    GaussianNB(),
]
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X, y, scoring='precision', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'precision'])
sns.boxplot(x='model_name', y='precision', data=cv_df)
sns.stripplot(x='model_name', y='precision', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()