Dask for Predicting the Onset/Diagnosis of Chronic Conditions: Diabetes
- categories: [Big Data]
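This notebook loads a diabetes dataset into Dask, trains an XGBoost classifier on it with dask-xgboost, and evaluates the model with an ROC curve and feature importances.

# Start a local Dask cluster with four workers and connect a client; displaying the client object shows a link to the diagnostic dashboard.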
import dask
from dask.distributed import Client, progress
client = Client(n_workers=4)
client
import pandas as pd
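# Load the diabetes CSV from GitHub into an in-memory pandas DataFrame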
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/diabetes.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines was deprecated in pandas 1.3 in favor of on_bad_lines
df
from dask import dataframe as dd
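# Distribute the in-memory DataFrame across 5 Dask partitions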
ddf = dd.from_pandas(df, npartitions=5)
ddf
# Feature columns (every column except the Outcome label)
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
ddf2 = ddf.sample(frac=0.2)  # optional 20% sample to cut memory use on a small cluster; not used below
ddf2
diab_diag = ddf.Outcome  # column of labels (Outcome: 1 = diagnosed with diabetes, 0 = not)
del ddf['Outcome']  # remove the label column from the training dataframe
ddf, diab_diag = dask.persist(ddf, diab_diag)  # start computing both in the background
diab_diag.head()
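# Convert any object columns to categoricals and one-hot encode them; the diabetes
# features are all numeric, so this is effectively a pass-through here but keeps the pipeline general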
df2 = dd.get_dummies(ddf.categorize()).persist()
len(df2.columns)
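# 90/10 train/test split; using the same random_state for features and labels keeps the two splits row-aligned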
data_train, data_test = df2.random_split([0.9, 0.1], random_state=1234)
labels_train, labels_test = diab_diag.random_split([0.9, 0.1], random_state=1234)
%%time
import dask_xgboost as dxgb

params = {'objective': 'binary:logistic',
          'max_depth': 16, 'eta': 0.01, 'subsample': 0.5,
          'min_child_weight': 1, 'tree_method': 'hist',
          'grow_policy': 'lossguide'}
# 'nround' is not a recognized XGBoost parameter; the number of boosting rounds
# is passed to train() instead, which forwards it to xgboost.train
bst = dxgb.train(client, params, data_train, labels_train, num_boost_round=1000)
bst
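# Aside: the standalone dask-xgboost package has since been deprecated in favor of
# the xgboost.dask module bundled with XGBoost itself. A rough sketch of the
# equivalent training step (assuming xgboost >= 1.0; the variable names below are
# illustrative, not part of the original notebook):
import xgboost as xgb
dtrain_dask = xgb.dask.DaskDMatrix(client, data_train, labels_train)
output = xgb.dask.train(client, params, dtrain_dask, num_boost_round=1000)
booster_alt = output['booster']  # a plain xgboost.Booster, usable like bst above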
import xgboost as xgb
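# Quick local sanity check: score the first few test rows with a plain, single-machine xgboost DMatrix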
pandas_df = data_test.head()
dtest = xgb.DMatrix(pandas_df)
bst.predict(dtest)
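# Distributed prediction over the full Dask test set; persist() keeps the result in cluster memory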
predictions = dxgb.predict(client, bst, data_test).persist()
predictions
from sklearn.metrics import roc_auc_score, roc_curve
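# Area under the ROC curve on the held-out test set (both Dask collections are pulled to local memory with compute())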
print(roc_auc_score(labels_test.compute(), predictions.compute()))
import matplotlib.pyplot as plt
fpr, tpr, _ = roc_curve(labels_test.compute(), predictions.compute())
# Taken from
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
plt.figure(figsize=(8, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic: diabetes test set')
plt.legend(loc="lower right")
plt.show()
import dask
import xgboost
import dask_xgboost
%matplotlib inline
import matplotlib.pyplot as plt
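# Plot the nine features the booster relied on most, using XGBoost's built-in importance scores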
ax = xgboost.plot_importance(bst, height=0.8, max_num_features=9)
ax.grid(False, axis="y")
ax.set_title('Estimated feature importance')
plt.show()
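# Recompute the test-set predictions and draw the ROC curve again, this time with the AUC reported in the legend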
y_hat = dask_xgboost.predict(client, bst, data_test).persist()
y_hat
from sklearn.metrics import roc_curve
labels_test, y_hat = dask.compute(labels_test, y_hat)
fpr, tpr, _ = roc_curve(labels_test, y_hat)
from sklearn.metrics import auc
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(fpr, tpr, lw=3,
        label='ROC Curve (area = {:.2f})'.format(auc(fpr, tpr)))
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set(
    xlim=(0, 1),
    ylim=(0, 1),
    title="ROC Curve",
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
)
ax.legend();
plt.show()