#Code from https://github.com/pycaret/pycaret/

# Report the installed PyCaret version so the run can be reproduced
# (the recorded output for this notebook is '2.3.1').
from pycaret.utils import version
version()

'2.3.1'

1. Data Repository

import pandas as pd

# Raw CSV of the stroke training data, fetched over HTTPS.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'

# Skip malformed rows (with a warning) instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`; 'warn' matches the old error_bad_lines=False behaviour.
# The legacy keyword is only used on pandas versions that predate the new one.
try:
    df = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(url, error_bad_lines=False)
df

	id	gender	age	hypertension	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	30669	Male	3.0	0	0	No	children	Rural	95.12	18.0	NaN	0
1	30468	Male	58.0	1	0	Yes	Private	Urban	87.96	39.2	never smoked	0
2	16523	Female	8.0	0	0	No	Private	Urban	110.89	17.6	NaN	0
3	56543	Female	70.0	0	0	Yes	Private	Rural	69.04	35.9	formerly smoked	0
4	46136	Male	14.0	0	0	No	Never_worked	Rural	161.28	19.1	NaN	0
...	...	...	...	...	...	...	...	...	...	...	...	...
43395	56196	Female	10.0	0	0	No	children	Urban	58.64	20.4	never smoked	0
43396	5450	Female	56.0	0	0	Yes	Govt_job	Urban	213.61	55.4	formerly smoked	0
43397	28375	Female	82.0	1	0	Yes	Private	Urban	91.94	28.9	formerly smoked	0
43398	27973	Male	40.0	0	0	Yes	Private	Urban	99.16	33.2	never smoked	0
43399	36271	Female	82.0	0	0	Yes	Private	Urban	79.48	20.6	never smoked	0

43400 rows × 12 columns

# Second reference to the raw frame (an alias, not a copy); `data` is copied
# later when the scoring frame for predict_model is built.
data=df

2. Initialize Setup

# Star import brings PyCaret's classification API (setup, compare_models,
# create_model, ...) into the notebook namespace.
from pycaret.classification import *
# Initialise the experiment: 'stroke' is the target, experiment logging is on
# under the name 'health2', and session_id=123 is presumably the random seed
# (the rf parameters recorded further down show random_state 123).
# NOTE(review): no column is excluded here, and the summary below reports
# 4 numeric + 7 categorical = 11 features, i.e. every non-target column
# including `id` — confirm whether the identifier should be a feature.
# NOTE(review): the summary reports "Fix Imbalance  False" while the baselines
# below reach ~0.98 accuracy with near-zero recall — consider fix_imbalance=True.
clf1 = setup(df, target = 'stroke', session_id=123, log_experiment=True, experiment_name='health2')

	Description	Value
0	session_id	123
1	Target	stroke
2	Target Type	Binary
3	Label Encoded	0: 0, 1: 1
4	Original Data	(43400, 12)
5	Missing Values	True
6	Numeric Features	4
7	Categorical Features	7
8	Ordinal Features	False
9	High Cardinality Features	False
10	High Cardinality Method	None
11	Transformed Train Set	(30379, 19)
12	Transformed Test Set	(13021, 19)
13	Shuffle Train-Test	True
14	Stratify Train-Test	False
15	Fold Generator	StratifiedKFold
16	Fold Number	10
17	CPU Jobs	-1
18	Use GPU	False
19	Log Experiment	True
20	Experiment Name	health2
21	USI	eaf8
22	Imputation Type	simple
23	Iterative Imputation Iteration	None
24	Numeric Imputer	mean
25	Iterative Imputation Numeric Model	None
26	Categorical Imputer	constant
27	Iterative Imputation Categorical Model	None
28	Unknown Categoricals Handling	least_frequent
29	Normalize	False
30	Normalize Method	None
31	Transformation	False
32	Transformation Method	None
33	PCA	False
34	PCA Method	None
35	PCA Components	None
36	Ignore Low Variance	False
37	Combine Rare Levels	False
38	Rare Level Threshold	None
39	Numeric Binning	False
40	Remove Outliers	False
41	Outliers Threshold	None
42	Remove Multicollinearity	False
43	Multicollinearity Threshold	None
44	Clustering	False
45	Clustering Iteration	None
46	Polynomial Features	False
47	Polynomial Degree	None
48	Trignometry Features	False
49	Polynomial Threshold	None
50	Group Features	False
51	Feature Selection	False
52	Feature Selection Method	classic
53	Features Selection Threshold	None
54	Feature Interaction	False
55	Feature Ratio	False
56	Interaction Threshold	None
57	Fix Imbalance	False
58	Fix Imbalance Method	SMOTE

3. Compare Baseline

# Evaluate the baseline estimators and keep the result (presumably the
# top-ranked model) as `best_model`.
# NOTE(review): the recorded table is ordered by Accuracy, and the top rows
# have Recall 0.0000 — on this data accuracy ranking is uninformative;
# consider ranking by AUC or Recall instead.
best_model = compare_models()

	Model	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC	TT (Sec)
ridge	Ridge Classifier	0.9822	0.0000	0.0000	0.0000	0.0000	0.0000	0.0000	0.0210
rf	Random Forest Classifier	0.9822	0.7962	0.0000	0.0000	0.0000	0.0000	0.0000	0.7070
lr	Logistic Regression	0.9821	0.6887	0.0000	0.0000	0.0000	-0.0002	-0.0007	0.4090
ada	Ada Boost Classifier	0.9821	0.8441	0.0000	0.0000	0.0000	-0.0001	-0.0003	0.3870
knn	K Neighbors Classifier	0.9819	0.5417	0.0000	0.0000	0.0000	-0.0005	-0.0015	0.0940
lightgbm	Light Gradient Boosting Machine	0.9818	0.8354	0.0111	0.2450	0.0210	0.0195	0.0470	0.1320
et	Extra Trees Classifier	0.9817	0.7437	0.0037	0.1500	0.0072	0.0060	0.0202	0.4670
gbc	Gradient Boosting Classifier	0.9816	0.8480	0.0000	0.0000	0.0000	-0.0012	-0.0027	1.2880
lda	Linear Discriminant Analysis	0.9795	0.8395	0.0259	0.1347	0.0433	0.0376	0.0515	0.0560
svm	SVM - Linear Kernel	0.9794	0.0000	0.0111	0.0534	0.0178	0.0127	0.0167	0.2770
nb	Naive Bayes	0.9657	0.8367	0.1181	0.1022	0.1092	0.0919	0.0923	0.0230
dt	Decision Tree Classifier	0.9621	0.5334	0.0889	0.0675	0.0766	0.0576	0.0583	0.0520
qda	Quadratic Discriminant Analysis	0.6119	0.5457	0.4771	0.0222	0.0414	0.0085	0.0266	0.0570

4. Create Model

# Logistic regression ('lr'); the recorded output shows 10 folds (0-9),
# matching the "Fold Number 10" setting in the setup summary.
lr = create_model('lr')

	Accuracy	AUC	Kappa	MCC
0	0.9819	0.6751	-0.0006	-0.0024
1	0.9822	0.6980	0.0000	0.0000
2	0.9822	0.6669	0.0000	0.0000
3	0.9822	0.6529	0.0000	0.0000
4	0.9822	0.7013	0.0000	0.0000
5	0.9819	0.6957	-0.0006	-0.0024
6	0.9822	0.7857	0.0000	0.0000
7	0.9819	0.6158	-0.0006	-0.0024
8	0.9819	0.6596	0.0000	0.0000
9	0.9822	0.7355	0.0000	0.0000
Mean	0.9821	0.6887	-0.0002	-0.0007
SD	0.0002	0.0447	0.0003	0.0011

# Decision tree ('dt'); reused below as the base estimator for the bagged
# and boosted ensembles.
dt = create_model('dt')

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
0	0.9618	0.5351	0.0926	0.0694	0.0794	0.0603	0.0609
1	0.9608	0.5073	0.0370	0.0290	0.0325	0.0128	0.0129
2	0.9631	0.5539	0.1296	0.0972	0.1111	0.0927	0.0937
3	0.9625	0.5172	0.0556	0.0455	0.0500	0.0311	0.0312
4	0.9638	0.5997	0.2222	0.1500	0.1791	0.1613	0.1646
5	0.9628	0.5265	0.0741	0.0597	0.0661	0.0474	0.0477
6	0.9645	0.5455	0.1111	0.0909	0.1000	0.0821	0.0825
7	0.9628	0.5265	0.0741	0.0597	0.0661	0.0474	0.0477
8	0.9562	0.4958	0.0182	0.0125	0.0148	-0.0068	-0.0069
9	0.9631	0.5266	0.0741	0.0606	0.0667	0.0480	0.0483
Mean	0.9621	0.5334	0.0889	0.0675	0.0766	0.0576	0.0583
SD	0.0022	0.0273	0.0542	0.0366	0.0436	0.0444	0.0453

# Random forest ('rf') with the fold count overridden to 5 for this call
# (the recorded output shows folds 0-4). This untuned model is later used for
# plotting and as the stacking meta-model.
rf = create_model('rf', fold = 5)

	Accuracy	AUC
0	0.9822	0.8053
1	0.9822	0.7562
2	0.9822	0.8108
3	0.9821	0.7705
4	0.9822	0.8142
Mean	0.9822	0.7914
SD	0.0001	0.0235

# List the estimators available in this session, indexed by the short IDs
# accepted by create_model (e.g. 'lr', 'dt', 'rf').
models()

	Name	Reference	Turbo
ID
lr	Logistic Regression	sklearn.linear_model._logistic.LogisticRegression	True
knn	K Neighbors Classifier	sklearn.neighbors._classification.KNeighborsCl...	True
nb	Naive Bayes	sklearn.naive_bayes.GaussianNB	True
dt	Decision Tree Classifier	sklearn.tree._classes.DecisionTreeClassifier	True
svm	SVM - Linear Kernel	sklearn.linear_model._stochastic_gradient.SGDC...	True
rbfsvm	SVM - Radial Kernel	sklearn.svm._classes.SVC	False
gpc	Gaussian Process Classifier	sklearn.gaussian_process._gpc.GaussianProcessC...	False
mlp	MLP Classifier	sklearn.neural_network._multilayer_perceptron....	False
ridge	Ridge Classifier	sklearn.linear_model._ridge.RidgeClassifier	True
rf	Random Forest Classifier	sklearn.ensemble._forest.RandomForestClassifier	True
qda	Quadratic Discriminant Analysis	sklearn.discriminant_analysis.QuadraticDiscrim...	True
ada	Ada Boost Classifier	sklearn.ensemble._weight_boosting.AdaBoostClas...	True
gbc	Gradient Boosting Classifier	sklearn.ensemble._gb.GradientBoostingClassifier	True
lda	Linear Discriminant Analysis	sklearn.discriminant_analysis.LinearDiscrimina...	True
et	Extra Trees Classifier	sklearn.ensemble._forest.ExtraTreesClassifier	True
lightgbm	Light Gradient Boosting Machine	lightgbm.sklearn.LGBMClassifier	True

# IDs of the ensemble-type estimators only, as a plain Python list.
models(type='ensemble').index.tolist()

['rf', 'ada', 'gbc', 'et', 'lightgbm']

#ensembled_models = compare_models(include = models(type='ensemble').index.tolist(), fold = 3)

5. Tune Hyperparameters

# Hyperparameter search for the logistic regression.
# NOTE(review): the recorded mean AUC is 0.6840 versus 0.6887 before tuning,
# so tuning brought no improvement here.
tuned_lr = tune_model(lr)

	Accuracy	AUC	Kappa	MCC
0	0.9819	0.6751	-0.0006	-0.0024
1	0.9822	0.8103	0.0000	0.0000
2	0.9822	0.6669	0.0000	0.0000
3	0.9822	0.6529	0.0000	0.0000
4	0.9822	0.7013	0.0000	0.0000
5	0.9819	0.6957	-0.0006	-0.0024
6	0.9822	0.6268	0.0000	0.0000
7	0.9819	0.6158	-0.0006	-0.0024
8	0.9819	0.6596	0.0000	0.0000
9	0.9822	0.7355	0.0000	0.0000
Mean	0.9821	0.6840	-0.0002	-0.0007
SD	0.0002	0.0537	0.0003	0.0011

# Hyperparameter search for the random forest.
# NOTE(review): every fold in the recorded output reports AUC 0.5000, versus
# a mean of ~0.79 for the untuned `rf` — the tuned model looks degenerate, yet
# it is fed into the blend and stack cells below. Consider
# tune_model(rf, optimize='AUC', choose_better=True) — confirm against the
# installed PyCaret version.
tuned_rf = tune_model(rf)

	Accuracy	AUC
0	0.9822	0.5000
1	0.9822	0.5000
2	0.9822	0.5000
3	0.9822	0.5000
4	0.9822	0.5000
5	0.9822	0.5000
6	0.9822	0.5000
7	0.9822	0.5000
8	0.9819	0.5000
9	0.9822	0.5000
Mean	0.9822	0.5000
SD	0.0001	0.0000

6. Ensemble Model

# Ensemble built on the decision tree with no `method` given; the variable
# name suggests the default is bagging — confirm.
bagged_dt = ensemble_model(dt)

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
0	0.9819	0.6107	0.0185	0.3333	0.0351	0.0333	0.0751
1	0.9819	0.6763	0.0000	0.0000	0.0000	-0.0006	-0.0024
2	0.9812	0.6402	0.0000	0.0000	0.0000	-0.0019	-0.0042
3	0.9816	0.5961	0.0000	0.0000	0.0000	-0.0013	-0.0035
4	0.9819	0.6676	0.0000	0.0000	0.0000	-0.0006	-0.0024
5	0.9822	0.6122	0.0000	0.0000	0.0000	0.0000	0.0000
6	0.9822	0.6289	0.0000	0.0000	0.0000	0.0000	0.0000
7	0.9809	0.6649	0.0000	0.0000	0.0000	-0.0025	-0.0049
8	0.9819	0.6254	0.0000	0.0000	0.0000	0.0000	0.0000
9	0.9816	0.6504	0.0000	0.0000	0.0000	-0.0013	-0.0035
Mean	0.9817	0.6373	0.0019	0.0333	0.0035	0.0025	0.0054
SD	0.0004	0.0258	0.0056	0.1000	0.0105	0.0103	0.0233

# Boosted ensemble built on the same decision tree.
boosted_dt = ensemble_model(dt, method = 'Boosting')

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
0	0.9615	0.5349	0.0926	0.0685	0.0787	0.0595	0.0602
1	0.9608	0.5073	0.0370	0.0290	0.0325	0.0128	0.0129
2	0.9598	0.5341	0.0926	0.0641	0.0758	0.0559	0.0569
3	0.9602	0.5160	0.0556	0.0411	0.0472	0.0274	0.0277
4	0.9641	0.5908	0.2037	0.1429	0.1679	0.1502	0.1527
5	0.9658	0.5462	0.1111	0.0968	0.1034	0.0861	0.0863
6	0.9638	0.5452	0.1111	0.0882	0.0984	0.0801	0.0807
7	0.9598	0.5250	0.0741	0.0526	0.0615	0.0416	0.0423
8	0.9575	0.5054	0.0364	0.0256	0.0301	0.0090	0.0092
9	0.9648	0.5275	0.0741	0.0656	0.0696	0.0517	0.0518
Mean	0.9618	0.5332	0.0888	0.0674	0.0765	0.0574	0.0581
SD	0.0025	0.0234	0.0460	0.0334	0.0385	0.0393	0.0399

7. Blend Models

# Combine the boosted tree, bagged tree and tuned random forest with
# method='soft' (presumably probability-averaged voting — confirm).
blender = blend_models(estimator_list = [boosted_dt, bagged_dt, tuned_rf], method = 'soft')

	Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
0	0.9819	0.6065	0.0185	0.3333	0.0351	0.0333	0.0751
1	0.9809	0.6705	0.0000	0.0000	0.0000	-0.0025	-0.0049
2	0.9816	0.6368	0.0185	0.2500	0.0345	0.0321	0.0638
3	0.9806	0.5950	0.0000	0.0000	0.0000	-0.0030	-0.0055
4	0.9826	0.6973	0.0185	1.0000	0.0364	0.0357	0.1349
5	0.9819	0.6485	0.0000	0.0000	0.0000	-0.0006	-0.0024
6	0.9816	0.6384	0.0000	0.0000	0.0000	-0.0013	-0.0035
7	0.9816	0.6687	0.0000	0.0000	0.0000	-0.0013	-0.0035
8	0.9809	0.6266	0.0000	0.0000	0.0000	-0.0019	-0.0043
9	0.9809	0.6580	0.0000	0.0000	0.0000	-0.0025	-0.0049
Mean	0.9814	0.6446	0.0056	0.1583	0.0106	0.0088	0.0245
SD	0.0006	0.0293	0.0085	0.3038	0.0162	0.0163	0.0469

8. Stack Models

# Stack the same three base models, using the untuned random forest `rf`
# as the meta-model.
stacker = stack_models(estimator_list = [boosted_dt,bagged_dt,tuned_rf], meta_model=rf)

	Accuracy	AUC
0	0.9822	0.7220
1	0.9822	0.8276
2	0.9822	0.7600
3	0.9822	0.8048
4	0.9822	0.8862
5	0.9822	0.8140
6	0.9822	0.7350
7	0.9822	0.8074
8	0.9819	0.7859
9	0.9822	0.7983
Mean	0.9822	0.7941
SD	0.0001	0.0450

9. Analyze Model

# Default diagnostic plot for the random forest.
plot_model(rf)

# Remaining diagnostics, rendered in the same order as the individual cells.
for plot_kind in ('confusion_matrix', 'boundary', 'feature', 'pr', 'class_report'):
    plot_model(rf, plot = plot_kind)

# Evaluation view for the same model (the recorded output lists its
# hyperparameters).
evaluate_model(rf)

	Parameters
bootstrap	True
ccp_alpha	0.0
class_weight	None
criterion	gini
max_depth	None
max_features	auto
max_leaf_nodes	None
max_samples	None
min_impurity_decrease	0.0
min_impurity_split	None
min_samples_leaf	1
min_samples_split	2
min_weight_fraction_leaf	0.0
n_estimators	100
n_jobs	-1
oob_score	False
random_state	123
verbose	0
warm_start	False

10. Interpret Model

# Fit a random forest with cross-validation disabled, for use with the
# interpretation plots below. The variable used to be called `catboost`,
# which was misleading: the estimator ID passed to create_model is 'rf'.
rf_interp = create_model('rf', cross_validation=False)

# Default interpretation plot.
interpret_model(rf_interp)

# Correlation-style plot.
interpret_model(rf_interp, plot = 'correlation')

# Explanation of a single prediction: the observation at index 12.
interpret_model(rf_interp, plot = 'reason', observation = 12)

11. AutoML()

# Select a model via automl with Recall as the optimisation metric; the bare
# `best` on the next line echoes the chosen estimator in the notebook.
# `best` is reused below for prediction, saving and deployment.
best = automl(optimize = 'Recall')
best

12. Predict Model

# Score the logistic regression without passing new data; per the variable
# name the predictions are for the hold-out split — confirm.
pred_holdouts = predict_model(lr)
# Preview the first rows of the scored frame.
pred_holdouts.head()

# Build an unlabeled scoring frame by removing the target column. The target
# of this experiment is 'stroke'; the previous code dropped 'Purchase', a
# column that does not exist in this dataset, which raises KeyError.
# DataFrame.drop returns a new frame, so `data` itself is left untouched.
new_data = data.drop(columns=['stroke'])
predict_new = predict_model(best, data=new_data)
predict_new.head()

13. Save / Load Model

# Persist `best` under the name 'best-model'.
save_model(best, model_name='best-model')

# Reload it by the same name and print the reloaded object to confirm the
# round trip.
loaded_bestmodel = load_model('best-model')
print(loaded_bestmodel)

# Switch scikit-learn's estimator display to the diagram form and show the
# first element of the reloaded object (presumably the first pipeline step).
# set_config is called through the `sklearn` module so that it does not rebind
# the bare name `set_config`, which PyCaret's classification module also
# provides; the module is imported once instead of twice.
import sklearn

sklearn.set_config(display='diagram')
loaded_bestmodel[0]

# Restore the plain-text display for the rest of the notebook.
sklearn.set_config(display='text')

14. Deploy Model

# Deploy `best` under the name 'best-aws' using the bucket 'pycaret-test'.
# NOTE(review): presumably uploads to AWS S3 and needs cloud credentials
# configured beforehand — confirm before running, since this has remote side
# effects.
deploy_model(best, model_name = 'best-aws', authentication = {'bucket' : 'pycaret-test'})

15. Get Config / Set Config

# Pull the training feature frame out of the experiment and preview it.
X_train = get_config('X_train')
X_train.head()

# Current value of the experiment's 'seed' setting.
get_config('seed')

# The explicit import guarantees that `set_config` refers to PyCaret's
# function rather than any other `set_config` bound earlier in the session.
from pycaret.classification import set_config
# Override the 'seed' setting for subsequent calls.
set_config('seed', 999)

# Read the setting back to confirm the change.
get_config('seed')

16. MLFlow UI

# !mlflow ui