#Code from https://github.com/pycaret/pycaret/
# Report the installed PyCaret version; the outputs recorded in this file
# were produced with the version shown on the next line.
from pycaret.utils import version
version()
'2.3.1'

1. Data Repository

# Load the stroke training set from GitHub into a DataFrame.
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# error_bad_lines=False asks pandas to skip malformed rows instead of raising.
# NOTE(review): this keyword was deprecated in pandas 1.3 and removed in 2.0
# in favour of on_bad_lines='skip'; switch once the pinned pandas allows it.
df = pd.read_csv(url, error_bad_lines=False)
# Bare expression: only displays the frame when run as a notebook cell.
df
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 30669 Male 3.0 0 0 No children Rural 95.12 18.0 NaN 0
1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
2 16523 Female 8.0 0 0 No Private Urban 110.89 17.6 NaN 0
3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
4 46136 Male 14.0 0 0 No Never_worked Rural 161.28 19.1 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ...
43395 56196 Female 10.0 0 0 No children Urban 58.64 20.4 never smoked 0
43396 5450 Female 56.0 0 0 Yes Govt_job Urban 213.61 55.4 formerly smoked 0
43397 28375 Female 82.0 1 0 Yes Private Urban 91.94 28.9 formerly smoked 0
43398 27973 Male 40.0 0 0 Yes Private Urban 99.16 33.2 never smoked 0
43399 36271 Female 82.0 0 0 Yes Private Urban 79.48 20.6 never smoked 0

43400 rows × 12 columns

# Second name bound to the same DataFrame object (no copy is made here);
# `data` is the name used later when building the frame to predict on.
data=df

2. Initialize Setup

# Star import: brings the PyCaret classification API (setup, compare_models,
# create_model, ...) into the namespace used by the rest of this file.
from pycaret.classification import *
# Initialise the experiment with 'stroke' as the target, a fixed session_id
# for reproducibility, and experiment logging enabled under the name 'health2'.
# NOTE(review): the recorded setup summary shows "Fix Imbalance False" and the
# baseline models score ~0 recall; consider fix_imbalance=True.
# NOTE(review): 'id' looks like a row identifier and is presumably being used
# as a numeric feature; consider ignore_features=['id'] (confirm first).
clf1 = setup(df, target = 'stroke', session_id=123, log_experiment=True, experiment_name='health2')
Description Value
0 session_id 123
1 Target stroke
2 Target Type Binary
3 Label Encoded 0: 0, 1: 1
4 Original Data (43400, 12)
5 Missing Values True
6 Numeric Features 4
7 Categorical Features 7
8 Ordinal Features False
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (30379, 19)
12 Transformed Test Set (13021, 19)
13 Shuffle Train-Test True
14 Stratify Train-Test False
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment True
20 Experiment Name health2
21 USI eaf8
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling least_frequent
29 Normalize False
30 Normalize Method None
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance False
37 Combine Rare Levels False
38 Rare Level Threshold None
39 Numeric Binning False
40 Remove Outliers False
41 Outliers Threshold None
42 Remove Multicollinearity False
43 Multicollinearity Threshold None
44 Clustering False
45 Clustering Iteration None
46 Polynomial Features False
47 Polynomial Degree None
48 Trignometry Features False
49 Polynomial Threshold None
50 Group Features False
51 Feature Selection False
52 Feature Selection Method classic
53 Features Selection Threshold None
54 Feature Interaction False
55 Feature Ratio False
56 Interaction Threshold None
57 Fix Imbalance False
58 Fix Imbalance Method SMOTE

3. Compare Baseline

# Cross-validate the library's baseline estimators and keep the top-ranked one.
# NOTE(review): the recorded table is ordered by Accuracy, which is ~0.98 even
# for models with zero recall on this target; ranking by another metric
# (e.g. sort='AUC' or sort='Recall') would likely be more informative.
best_model = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
ridge Ridge Classifier 0.9822 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0210
rf Random Forest Classifier 0.9822 0.7962 0.0000 0.0000 0.0000 0.0000 0.0000 0.7070
lr Logistic Regression 0.9821 0.6887 0.0000 0.0000 0.0000 -0.0002 -0.0007 0.4090
ada Ada Boost Classifier 0.9821 0.8441 0.0000 0.0000 0.0000 -0.0001 -0.0003 0.3870
knn K Neighbors Classifier 0.9819 0.5417 0.0000 0.0000 0.0000 -0.0005 -0.0015 0.0940
lightgbm Light Gradient Boosting Machine 0.9818 0.8354 0.0111 0.2450 0.0210 0.0195 0.0470 0.1320
et Extra Trees Classifier 0.9817 0.7437 0.0037 0.1500 0.0072 0.0060 0.0202 0.4670
gbc Gradient Boosting Classifier 0.9816 0.8480 0.0000 0.0000 0.0000 -0.0012 -0.0027 1.2880
lda Linear Discriminant Analysis 0.9795 0.8395 0.0259 0.1347 0.0433 0.0376 0.0515 0.0560
svm SVM - Linear Kernel 0.9794 0.0000 0.0111 0.0534 0.0178 0.0127 0.0167 0.2770
nb Naive Bayes 0.9657 0.8367 0.1181 0.1022 0.1092 0.0919 0.0923 0.0230
dt Decision Tree Classifier 0.9621 0.5334 0.0889 0.0675 0.0766 0.0576 0.0583 0.0520
qda Quadratic Discriminant Analysis 0.6119 0.5457 0.4771 0.0222 0.0414 0.0085 0.0266 0.0570

4. Create Model

# Logistic regression, scored with the experiment's 10-fold cross-validation
# (one row per fold in the recorded table).
lr = create_model('lr')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9819 0.6751 0.0000 0.0000 0.0000 -0.0006 -0.0024
1 0.9822 0.6980 0.0000 0.0000 0.0000 0.0000 0.0000
2 0.9822 0.6669 0.0000 0.0000 0.0000 0.0000 0.0000
3 0.9822 0.6529 0.0000 0.0000 0.0000 0.0000 0.0000
4 0.9822 0.7013 0.0000 0.0000 0.0000 0.0000 0.0000
5 0.9819 0.6957 0.0000 0.0000 0.0000 -0.0006 -0.0024
6 0.9822 0.7857 0.0000 0.0000 0.0000 0.0000 0.0000
7 0.9819 0.6158 0.0000 0.0000 0.0000 -0.0006 -0.0024
8 0.9819 0.6596 0.0000 0.0000 0.0000 0.0000 0.0000
9 0.9822 0.7355 0.0000 0.0000 0.0000 0.0000 0.0000
Mean 0.9821 0.6887 0.0000 0.0000 0.0000 -0.0002 -0.0007
SD 0.0002 0.0447 0.0000 0.0000 0.0000 0.0003 0.0011
# Decision tree, scored over 10 cross-validation folds.
dt = create_model('dt')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9618 0.5351 0.0926 0.0694 0.0794 0.0603 0.0609
1 0.9608 0.5073 0.0370 0.0290 0.0325 0.0128 0.0129
2 0.9631 0.5539 0.1296 0.0972 0.1111 0.0927 0.0937
3 0.9625 0.5172 0.0556 0.0455 0.0500 0.0311 0.0312
4 0.9638 0.5997 0.2222 0.1500 0.1791 0.1613 0.1646
5 0.9628 0.5265 0.0741 0.0597 0.0661 0.0474 0.0477
6 0.9645 0.5455 0.1111 0.0909 0.1000 0.0821 0.0825
7 0.9628 0.5265 0.0741 0.0597 0.0661 0.0474 0.0477
8 0.9562 0.4958 0.0182 0.0125 0.0148 -0.0068 -0.0069
9 0.9631 0.5266 0.0741 0.0606 0.0667 0.0480 0.0483
Mean 0.9621 0.5334 0.0889 0.0675 0.0766 0.0576 0.0583
SD 0.0022 0.0273 0.0542 0.0366 0.0436 0.0444 0.0453
# Random forest, scored with 5 folds instead of the experiment's 10.
rf = create_model('rf', fold = 5)
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9822 0.8053 0.0000 0.0000 0.0000 0.0000 0.0000
1 0.9822 0.7562 0.0000 0.0000 0.0000 0.0000 0.0000
2 0.9822 0.8108 0.0000 0.0000 0.0000 0.0000 0.0000
3 0.9821 0.7705 0.0000 0.0000 0.0000 0.0000 0.0000
4 0.9822 0.8142 0.0000 0.0000 0.0000 0.0000 0.0000
Mean 0.9822 0.7914 0.0000 0.0000 0.0000 0.0000 0.0000
SD 0.0001 0.0235 0.0000 0.0000 0.0000 0.0000 0.0000
# Show the table of available estimators, indexed by the short ID that
# create_model() takes (e.g. 'lr', 'dt', 'rf').
models()
Name Reference Turbo
ID
lr Logistic Regression sklearn.linear_model._logistic.LogisticRegression True
knn K Neighbors Classifier sklearn.neighbors._classification.KNeighborsCl... True
nb Naive Bayes sklearn.naive_bayes.GaussianNB True
dt Decision Tree Classifier sklearn.tree._classes.DecisionTreeClassifier True
svm SVM - Linear Kernel sklearn.linear_model._stochastic_gradient.SGDC... True
rbfsvm SVM - Radial Kernel sklearn.svm._classes.SVC False
gpc Gaussian Process Classifier sklearn.gaussian_process._gpc.GaussianProcessC... False
mlp MLP Classifier sklearn.neural_network._multilayer_perceptron.... False
ridge Ridge Classifier sklearn.linear_model._ridge.RidgeClassifier True
rf Random Forest Classifier sklearn.ensemble._forest.RandomForestClassifier True
qda Quadratic Discriminant Analysis sklearn.discriminant_analysis.QuadraticDiscrim... True
ada Ada Boost Classifier sklearn.ensemble._weight_boosting.AdaBoostClas... True
gbc Gradient Boosting Classifier sklearn.ensemble._gb.GradientBoostingClassifier True
lda Linear Discriminant Analysis sklearn.discriminant_analysis.LinearDiscrimina... True
et Extra Trees Classifier sklearn.ensemble._forest.ExtraTreesClassifier True
lightgbm Light Gradient Boosting Machine lightgbm.sklearn.LGBMClassifier True
# Short IDs of the ensemble-type estimators only, as a plain list.
models(type='ensemble').index.tolist()
['rf', 'ada', 'gbc', 'et', 'lightgbm']
#ensembled_models = compare_models(include = models(type='ensemble').index.tolist(), fold = 3)

5. Tune Hyperparameters

# Hyperparameter search for the logistic regression model.
# NOTE(review): the recorded scores are no better than the untuned model's
# (same mean Accuracy, slightly lower mean AUC); consider passing optimize=
# with a metric that reflects the minority class.
tuned_lr = tune_model(lr)
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9819 0.6751 0.0000 0.0000 0.0000 -0.0006 -0.0024
1 0.9822 0.8103 0.0000 0.0000 0.0000 0.0000 0.0000
2 0.9822 0.6669 0.0000 0.0000 0.0000 0.0000 0.0000
3 0.9822 0.6529 0.0000 0.0000 0.0000 0.0000 0.0000
4 0.9822 0.7013 0.0000 0.0000 0.0000 0.0000 0.0000
5 0.9819 0.6957 0.0000 0.0000 0.0000 -0.0006 -0.0024
6 0.9822 0.6268 0.0000 0.0000 0.0000 0.0000 0.0000
7 0.9819 0.6158 0.0000 0.0000 0.0000 -0.0006 -0.0024
8 0.9819 0.6596 0.0000 0.0000 0.0000 0.0000 0.0000
9 0.9822 0.7355 0.0000 0.0000 0.0000 0.0000 0.0000
Mean 0.9821 0.6840 0.0000 0.0000 0.0000 -0.0002 -0.0007
SD 0.0002 0.0537 0.0000 0.0000 0.0000 0.0003 0.0011
# Hyperparameter search for the random forest.
# NOTE(review): in the recorded output the tuned forest scores AUC 0.5000 on
# every fold versus a 0.7914 mean for the untuned `rf`, i.e. tuning made it
# worse. Consider choose_better=True and/or optimize='AUC'. This `tuned_rf`
# is later passed to blend_models() and stack_models().
tuned_rf = tune_model(rf)
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
1 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
2 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
3 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
4 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
5 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
6 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
7 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
8 0.9819 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
9 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
Mean 0.9822 0.5000 0.0000 0.0000 0.0000 0.0000 0.0000
SD 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000

6. Ensemble Model

# Wrap the decision tree in an ensemble using ensemble_model's default method
# (the boosted variant in this file passes method explicitly).
bagged_dt = ensemble_model(dt)
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9819 0.6107 0.0185 0.3333 0.0351 0.0333 0.0751
1 0.9819 0.6763 0.0000 0.0000 0.0000 -0.0006 -0.0024
2 0.9812 0.6402 0.0000 0.0000 0.0000 -0.0019 -0.0042
3 0.9816 0.5961 0.0000 0.0000 0.0000 -0.0013 -0.0035
4 0.9819 0.6676 0.0000 0.0000 0.0000 -0.0006 -0.0024
5 0.9822 0.6122 0.0000 0.0000 0.0000 0.0000 0.0000
6 0.9822 0.6289 0.0000 0.0000 0.0000 0.0000 0.0000
7 0.9809 0.6649 0.0000 0.0000 0.0000 -0.0025 -0.0049
8 0.9819 0.6254 0.0000 0.0000 0.0000 0.0000 0.0000
9 0.9816 0.6504 0.0000 0.0000 0.0000 -0.0013 -0.0035
Mean 0.9817 0.6373 0.0019 0.0333 0.0035 0.0025 0.0054
SD 0.0004 0.0258 0.0056 0.1000 0.0105 0.0103 0.0233
# Boosting ensemble built on the same decision tree.
boosted_dt = ensemble_model(dt, method = 'Boosting')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9615 0.5349 0.0926 0.0685 0.0787 0.0595 0.0602
1 0.9608 0.5073 0.0370 0.0290 0.0325 0.0128 0.0129
2 0.9598 0.5341 0.0926 0.0641 0.0758 0.0559 0.0569
3 0.9602 0.5160 0.0556 0.0411 0.0472 0.0274 0.0277
4 0.9641 0.5908 0.2037 0.1429 0.1679 0.1502 0.1527
5 0.9658 0.5462 0.1111 0.0968 0.1034 0.0861 0.0863
6 0.9638 0.5452 0.1111 0.0882 0.0984 0.0801 0.0807
7 0.9598 0.5250 0.0741 0.0526 0.0615 0.0416 0.0423
8 0.9575 0.5054 0.0364 0.0256 0.0301 0.0090 0.0092
9 0.9648 0.5275 0.0741 0.0656 0.0696 0.0517 0.0518
Mean 0.9618 0.5332 0.0888 0.0674 0.0765 0.0574 0.0581
SD 0.0025 0.0234 0.0460 0.0334 0.0385 0.0393 0.0399

7. Blend Models

# Soft-voting blend of the boosted tree, the bagged tree and the tuned forest.
blender = blend_models(estimator_list = [boosted_dt, bagged_dt, tuned_rf], method = 'soft')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9819 0.6065 0.0185 0.3333 0.0351 0.0333 0.0751
1 0.9809 0.6705 0.0000 0.0000 0.0000 -0.0025 -0.0049
2 0.9816 0.6368 0.0185 0.2500 0.0345 0.0321 0.0638
3 0.9806 0.5950 0.0000 0.0000 0.0000 -0.0030 -0.0055
4 0.9826 0.6973 0.0185 1.0000 0.0364 0.0357 0.1349
5 0.9819 0.6485 0.0000 0.0000 0.0000 -0.0006 -0.0024
6 0.9816 0.6384 0.0000 0.0000 0.0000 -0.0013 -0.0035
7 0.9816 0.6687 0.0000 0.0000 0.0000 -0.0013 -0.0035
8 0.9809 0.6266 0.0000 0.0000 0.0000 -0.0019 -0.0043
9 0.9809 0.6580 0.0000 0.0000 0.0000 -0.0025 -0.0049
Mean 0.9814 0.6446 0.0056 0.1583 0.0106 0.0088 0.0245
SD 0.0006 0.0293 0.0085 0.3038 0.0162 0.0163 0.0469

8. Stack Models

# Stack the three base models, using the untuned random forest `rf` as the
# meta-model that combines their predictions.
stacker = stack_models(estimator_list = [boosted_dt,bagged_dt,tuned_rf], meta_model=rf)
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9822 0.7220 0.0000 0.0000 0.0000 0.0000 0.0000
1 0.9822 0.8276 0.0000 0.0000 0.0000 0.0000 0.0000
2 0.9822 0.7600 0.0000 0.0000 0.0000 0.0000 0.0000
3 0.9822 0.8048 0.0000 0.0000 0.0000 0.0000 0.0000
4 0.9822 0.8862 0.0000 0.0000 0.0000 0.0000 0.0000
5 0.9822 0.8140 0.0000 0.0000 0.0000 0.0000 0.0000
6 0.9822 0.7350 0.0000 0.0000 0.0000 0.0000 0.0000
7 0.9822 0.8074 0.0000 0.0000 0.0000 0.0000 0.0000
8 0.9819 0.7859 0.0000 0.0000 0.0000 0.0000 0.0000
9 0.9822 0.7983 0.0000 0.0000 0.0000 0.0000 0.0000
Mean 0.9822 0.7941 0.0000 0.0000 0.0000 0.0000 0.0000
SD 0.0001 0.0450 0.0000 0.0000 0.0000 0.0000 0.0000

9. Analyze Model

# Default diagnostic plot first, then each named plot in turn.
plot_model(rf)
for plot_kind in ('confusion_matrix', 'boundary', 'feature', 'pr', 'class_report'):
    plot_model(rf, plot = plot_kind)
# Interactive widget exposing the same plots plus the model's parameters.
evaluate_model(rf)
Parameters
bootstrap True
ccp_alpha 0.0
class_weight None
criterion gini
max_depth None
max_features auto
max_leaf_nodes None
max_samples None
min_impurity_decrease 0.0
min_impurity_split None
min_samples_leaf 1
min_samples_split 2
min_weight_fraction_leaf 0.0
n_estimators 100
n_jobs -1
oob_score False
random_state 123
verbose 0
warm_start False

10. Interpret Model

# Fit a random forest once, without cross-validation, to serve as the model
# being explained. Named for what it is: an 'rf' estimator, not CatBoost.
rf_no_cv = create_model('rf', cross_validation=False)
# Default interpretation plot for the whole model.
interpret_model(rf_no_cv)
# Dependence-style plot.
interpret_model(rf_no_cv, plot = 'correlation')
# Explanation of a single prediction: the observation at index 12.
interpret_model(rf_no_cv, plot = 'reason', observation = 12)

11. AutoML()

# Select the best model among those trained in this session, judged by Recall.
# `best` is the model used for prediction, saving and deployment below.
best = automl(optimize = 'Recall')
# Bare expression: only displays the estimator when run as a notebook cell.
best

12. Predict Model

# Score the logistic regression model; with no `data` argument predict_model
# uses the hold-out split kept aside by setup().
pred_holdouts = predict_model(lr)
pred_holdouts.head()
# Build an "unseen" frame by removing the target column. The target in this
# dataset is 'stroke'; the previous 'Purchase' column does not exist here and
# raised KeyError. drop() returns a new frame, so `data` itself is untouched.
new_data = data.drop(columns=['stroke'])
predict_new = predict_model(best, data=new_data)
predict_new.head()

13. Save / Load Model

# Persist the selected model under the name 'best-model', then load it back.
save_model(best, model_name='best-model')
loaded_bestmodel = load_model('best-model')
print(loaded_bestmodel)
# Call scikit-learn's display setting through the module so the bare name
# `set_config` is not rebound away from the PyCaret function of the same name.
import sklearn
sklearn.set_config(display='diagram')
# First step of the loaded pipeline, rendered as a diagram in a notebook.
loaded_bestmodel[0]
# Restore the plain-text representation.
sklearn.set_config(display='text')

14. Deploy Model

# Upload the selected model as 'best-aws' to the 'pycaret-test' bucket.
# NOTE(review): presumably needs cloud credentials configured in the
# environment and an existing bucket; confirm before running.
deploy_model(best, model_name = 'best-aws', authentication = {'bucket' : 'pycaret-test'})

15. Get Config / Set Config

# Read the training features held by the current experiment.
X_train = get_config('X_train')
X_train.head()
# Current random seed of the experiment.
get_config('seed')
# Import PyCaret's set_config explicitly so the call below cannot resolve to
# sklearn.set_config, which shares the same name.
from pycaret.classification import set_config
set_config('seed', 999)
# Read the seed back to confirm the change.
get_config('seed')

16. MLFlow UI

# !mlflow ui