import seaborn as sns
# Global plot styling for the whole notebook: seaborn's "notebook" context with
# fonts scaled by 1.4, plus two matplotlib rc overrides (constrained layout
# enabled for every figure and a default figure size of 10 x 6).
sns.set_theme(context="notebook", font_scale=1.4,
              rc={"figure.constrained_layout.use": True,
                  "figure.figsize": [10, 6]})
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Load the stroke training data from the public GitHub copy of the CSV.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0, where
# it raises a TypeError. `on_bad_lines='warn'` is the documented replacement:
# malformed rows are skipped and a warning is emitted for each one.
df = pd.read_csv(url, on_bad_lines='warn')
df

# Drop every row that contains at least one missing value.
df = df.dropna()

# Sanity check: the per-column count of missing values after dropna().
df.isnull().sum()

# Show the available column names.
df.columns

# Ask scikit-learn to render estimators as diagrams when they are displayed.
sklearn.set_config(display='diagram')

# Target: the `stroke` column. Features: every other column except `id`.
y = df['stroke']
X = df.drop(columns=['stroke', 'id'])


# Keep only the numeric feature columns.
X = X.select_dtypes(include='number')
X

# Hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
X
age hypertension heart_disease avg_glucose_level bmi
1 58.0 1 0 87.96 39.2
3 70.0 0 0 69.04 35.9
6 52.0 0 0 77.59 17.7
7 75.0 0 1 243.53 27.0
8 32.0 0 0 77.67 32.3
... ... ... ... ... ...
43395 10.0 0 0 58.64 20.4
43396 56.0 0 0 213.61 55.4
43397 82.0 1 0 91.94 28.9
43398 40.0 0 0 99.16 33.2
43399 82.0 0 0 79.48 20.6

29072 rows × 5 columns

import matplotlib.pyplot as plt

# Scatter every column except `id` against `age`, one panel per column,
# on a 3 x 4 grid of axes.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))

# zip() stops at the shorter sequence, so any axes beyond the last column
# are simply left empty.
for name, ax in zip(df.columns.drop('id'), axes.flat):
    sns.scatterplot(data=df, x=name, y='age', ax=ax)

Model without scaling

Keep only the numeric features for this example (the categorical columns are dropped)

# Baseline: k-nearest-neighbours classifier on the raw (unscaled) numeric features.
# The column names are saved so that scaled arrays and tree plots can be labelled later.
feature_names = X.columns
from sklearn.model_selection import train_test_split

# Same arguments and seed as the earlier split of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.neighbors import KNeighborsClassifier

# Default hyper-parameters; fitted on the training split only.
knr = KNeighborsClassifier().fit(X_train, y_train)
# Score on the training split; the bare float below is the recorded notebook output.
knr.score(X_train, y_train)
0.9810126582278481
# Score on the held-out split; the bare float below is the recorded notebook output.
# NOTE(review): every model in this notebook scores close to 0.98, which may just
# reflect an imbalanced target -- compare against a majority-class baseline to confirm.
knr.score(X_test, y_test)
0.9821133736929004

Model with scaling

Scale first!

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to that same split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
import pandas as pd
# Wrap the scaled array in a DataFrame, labelled with the saved column names,
# so the per-feature distributions can be drawn as box plots.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=feature_names)
X_train_scaled_df.plot.box();

Train model on scaled data

# Refit a default KNN classifier on the scaled training features. This rebinds
# `knr`, so the model fitted on unscaled data is no longer reachable by that name.
knr = KNeighborsClassifier().fit(X_train_scaled, y_train)
# Score on the scaled training split; the bare float below is the recorded notebook output.
knr.score(X_train_scaled, y_train)
0.9805081636396992
# Transform the test split with the scaler that was already fitted on the
# training split; the scaler is not refitted here.
X_test_scaled = scaler.transform(X_test)
# Score on the scaled held-out split; the bare float below is the recorded notebook output.
knr.score(X_test_scaled, y_test)
0.9815630159603742
# %load solutions/03-ex1-solution.py
# `stroke` is treated as a class label everywhere else in this notebook (KNN and
# decision-tree classifiers), so the support-vector *classifier* is used here.
# The loaded solution fitted SVR, a regressor whose score() is R^2, which is why
# it produced a negative value that cannot be compared with the accuracies above.
# SVR stays imported and the `svr_*` names are kept in case later cells use them.
from sklearn.svm import SVC, SVR

# Support-vector classifier on the raw (unscaled) features.
svr_unscaled = SVC()

svr_unscaled.fit(X_train, y_train)

svr_unscaled.score(X_test, y_test)

# Same model on the standardized features, scored on the standardized test split.
svr_scaled = SVC()
svr_scaled.fit(X_train_scaled, y_train)

svr_scaled.score(X_test_scaled, y_test)
# The previously recorded output (-0.38405905025130793) was SVR's R^2 and no
# longer applies; re-run this cell to obtain the classifier's accuracy.

Tree based models

from sklearn.tree import DecisionTreeClassifier
# Depth-3 decision tree on the raw features; random_state is fixed for reproducibility.
tree = DecisionTreeClassifier(random_state=0, max_depth=3).fit(X_train, y_train)
# Score on the held-out split; the bare float below is the recorded notebook output.
tree.score(X_test, y_test)
0.9822509631260319
# Identical configuration, fitted on the standardized features instead.
tree_scaled = DecisionTreeClassifier(random_state=0, max_depth=3).fit(X_train_scaled, y_train)
# Score on the scaled held-out split; the recorded output matches the unscaled tree exactly.
tree_scaled.score(X_test_scaled, y_test)
0.9822509631260319

Why are the scores the same?

from sklearn.tree import plot_tree

# Draw the tree fitted on raw features, then the tree fitted on scaled features,
# each on its own large figure so the two structures can be compared.
for fitted_tree in (tree, tree_scaled):
    # Undo the seaborn theme so each tree is drawn with matplotlib's original settings.
    sns.reset_orig()
    fig, ax = plt.subplots(figsize=(20, 10))
    _ = plot_tree(
        fitted_tree,
        ax=ax,
        fontsize=16,
        feature_names=feature_names,
    )