Predicting Onset/Diagnosis of Chronic Conditions, Diabetes
- toc: true
- badges: true
- comments: true
- sticky_rank: 1
- categories: [Big Data, h2o]
Data source: the National Institute of Diabetes and Digestive and Kidney Diseases (https://www.niddk.nih.gov/).
Credit: code from https://www.kaggle.com/sudalairajkumar/getting-started-with-h2o
import h2o
import time
import seaborn
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
%matplotlib inline
# Start (or connect to) a local H2O cluster; all frames below live in it.
h2o.init()

# Load the diabetes dataset directly into an H2O frame.
diabetes_df = h2o.import_file(
    "https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/diabetes.csv",
    destination_frame="diabetes_df",
)

# Load the same CSV with pandas for quick local inspection.
# FIX: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the supported equivalent (skip malformed rows).
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/diabetes.csv'
df = pd.read_csv(url, on_bad_lines='skip')
df

# Summary statistics (count, mean, min/max, missing, ...) for every column.
diabetes_df.describe()
# Per-column histograms to eyeball each feature's distribution.
# FIX: the loop body was not indented (notebook-extraction artifact),
# which is a SyntaxError when run as a plain Python script.
for col in diabetes_df.columns:
    diabetes_df[col].hist()

# Correlation heatmap: correlations are computed server-side in H2O (.cor()),
# then pulled into pandas for plotting with seaborn.
plt.figure(figsize=(10, 10))
corr = diabetes_df.cor().as_data_frame()
corr.index = diabetes_df.columns
sns.heatmap(corr, annot=True, cmap='RdYlGn', vmin=-1, vmax=1)
plt.title("Correlation Heatmap", fontsize=16)
plt.show()
# Split 60/20/20 into train/validation/test (seeded for reproducibility).
train, valid, test = diabetes_df.split_frame(ratios=[0.6, 0.2], seed=1234)

response = "Outcome"
# Cast the target to categorical so H2O treats this as classification,
# not regression.
for frame in (train, valid, test):
    frame[response] = frame[response].asfactor()

print("Number of rows in train, valid and test set : ", train.shape[0], valid.shape[0], test.shape[0])

# Every column except the last (the target) is a predictor.
predictors = diabetes_df.columns[:-1]

# Baseline: GBM with all-default hyper-parameters.
gbm = H2OGradientBoostingEstimator()
gbm.train(x=predictors, y=response, training_frame=train)
print(gbm)

# Score the baseline on the held-out validation frame.
perf = gbm.model_performance(valid)
print(perf)
# Hand-tuned GBM: a large tree budget with a small learning rate,
# relying on early stopping to pick the effective number of trees.
tuned_params = dict(
    ntrees=3000,           # upper bound; early stopping decides the real count
    learn_rate=0.01,       # small steps to pair with the large tree budget
    stopping_rounds=20,    # halt once AUC stops improving for 20 rounds
    stopping_metric="AUC",
    col_sample_rate=0.7,   # column subsampling for regularization
    sample_rate=0.7,       # row subsampling for regularization
    seed=1234,
)
gbm_tune = H2OGradientBoostingEstimator(**tuned_params)
gbm_tune.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Validation AUC of the tuned model.
gbm_tune.model_performance(valid).auc()
from h2o.grid.grid_search import H2OGridSearch

# Base estimator shared by every grid point; only max_depth varies below.
gbm_grid = H2OGradientBoostingEstimator(
    ntrees=3000,
    learn_rate=0.01,
    stopping_rounds=20,
    stopping_metric="AUC",
    col_sample_rate=0.7,
    sample_rate=0.7,
    seed=1234,
)

# Exhaustive (Cartesian) search over tree depth.
hyper_params = {'max_depth': [4, 6, 8, 10, 12]}
grid = H2OGridSearch(
    gbm_grid,
    hyper_params,
    grid_id='depth_grid',
    search_criteria={'strategy': "Cartesian"},
)

# Train one model per depth value.
grid.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
print(grid)

# Rank the grid's models by validation AUC, best first.
sorted_grid = grid.get_grid(sort_by='auc', decreasing=True)
print(sorted_grid)
# GBM with 4-fold cross-validation for a more stable performance estimate.
cv_gbm = H2OGradientBoostingEstimator(
    ntrees=3000,
    learn_rate=0.05,
    stopping_rounds=20,
    stopping_metric="AUC",
    nfolds=4,
    seed=2018,
)
cv_gbm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Per-fold metric summary, pulled into a pandas DataFrame.
cv_summary = cv_gbm.cross_validation_metrics_summary().as_data_frame()
cv_summary

# AUC on the held-out validation frame.
cv_gbm.model_performance(valid).auc()
XGBoost:
from h2o.estimators import H2OXGBoostEstimator

# XGBoost counterpart of the cross-validated GBM above, with the same settings.
cv_xgb = H2OXGBoostEstimator(
    ntrees=3000,
    learn_rate=0.05,
    stopping_rounds=20,
    stopping_metric="AUC",
    nfolds=4,
    seed=2018,
)
cv_xgb.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Validation AUC, then the variable-importance plot.
cv_xgb.model_performance(valid).auc()
cv_xgb.varimp_plot()
AutoML: Automatic Machine Learning
For an overview of the automated model search used below, see the H2O AutoML documentation page.
from h2o.automl import H2OAutoML

# AutoML: cap the search at 10 models or 100 seconds, whichever comes first.
aml = H2OAutoML(max_models=10, max_runtime_secs=100, seed=1)
aml.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
AutoML leaderboard:
# Leaderboard of every model AutoML trained, ranked by its default metric.
lb = aml.leaderboard
lb

# Plot the standardized coefficients of the leader's metalearner.
# FIX: in older h2o releases metalearner() returned a dict with a 'name'
# key; recent releases return the model object directly — handle both
# instead of unconditionally subscripting.
# NOTE(review): this also assumes the leader is a Stacked Ensemble
# (only ensembles have a metalearner) — confirm before relying on it.
meta = aml.leader.metalearner()
metalearner = h2o.get_model(meta['name']) if isinstance(meta, dict) else meta
metalearner.std_coef_plot()