# Preprocessing example in scikit-learn.
# Code adapted from https://github.com/thomasjpfan/ml-workshop-intro
import seaborn as sns
# Global plotting defaults: notebook context, enlarged fonts, constrained
# layout and a 10x6 default figure size.
sns.set_theme(
    context="notebook",
    font_scale=1.4,
    rc={
        "figure.constrained_layout.use": True,
        "figure.figsize": [10, 6],
    },
)
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Load the stroke training data directly from GitHub.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. `on_bad_lines='warn'` keeps the old behaviour: malformed
# rows are skipped and a warning is emitted for each of them.
df = pd.read_csv(url, on_bad_lines='warn')
df

# Discard every row that contains at least one missing value, then confirm
# that no nulls remain and inspect the available columns.
df = df.dropna()
df.isnull().sum()
df.columns

# Ask scikit-learn to render estimators as diagrams when displayed.
sklearn.set_config(display='diagram')

# Target is the `stroke` column; `id` is dropped from the features as well.
X, y = df.drop(['stroke', 'id'], axis=1), df['stroke']
# Keep only the numeric feature columns.
X = X.select_dtypes(include='number')
X

# Fixed seed so the train/test partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X
import matplotlib.pyplot as plt
# 3x4 grid of scatter plots: each column of `df` except `id` is drawn on the
# x-axis against `age` on the y-axis, one panel per column.
fig, axes = plt.subplots(3, 4, figsize=(20, 10))
for name, ax in zip(df.columns.drop('id'), axes.flat):
    sns.scatterplot(data=df, x=name, y='age', ax=ax)
# Categorical columns are removed for this example; only the numeric features are used.
# Baseline: k-nearest-neighbours classifier on the raw (unscaled) features.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Column labels are remembered so arrays can be re-labelled later.
feature_names = X.columns

# Same seed as before, so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

knr = KNeighborsClassifier()
knr.fit(X_train, y_train)

# Score on the training split, then on the held-out split.
knr.score(X_train, y_train)
knr.score(X_test, y_test)
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Box plot of the scaled training features, re-labelled with column names.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=feature_names)
X_train_scaled_df.plot.box();

# Refit k-nearest neighbours on the scaled features and score both splits.
knr = KNeighborsClassifier()
knr.fit(X_train_scaled, y_train)
knr.score(X_train_scaled, y_train)
knr.score(X_test_scaled, y_test)
# %load solutions/03-ex1-solution.py
from sklearn.svm import SVR

# NOTE(review): SVR is a regressor, while the other models in this script are
# classifiers fitted on the same `stroke` target -- confirm whether a
# classifier (e.g. SVC) was intended here.

# Support vector regression on the raw features, scored on the test split.
svr_unscaled = SVR().fit(X_train, y_train)
svr_unscaled.score(X_test, y_test)

# Same model on the standardised features, for comparison.
svr_scaled = SVR().fit(X_train_scaled, y_train)
svr_scaled.score(X_test_scaled, y_test)
from sklearn.tree import DecisionTreeClassifier
# Depth-3 decision tree on the raw features, scored on the test split.
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

# Identically configured tree on the standardised features, for comparison.
tree_scaled = DecisionTreeClassifier(max_depth=3, random_state=0)
tree_scaled.fit(X_train_scaled, y_train)
tree_scaled.score(X_test_scaled, y_test)
from sklearn.tree import plot_tree
# Draw the tree trained on raw features, after restoring the original
# matplotlib settings in place of the seaborn theme.
sns.reset_orig()
fig, ax = plt.subplots(figsize=(20, 10))
_ = plot_tree(
    tree,
    feature_names=feature_names,
    fontsize=16,
    ax=ax,
)

from sklearn.tree import plot_tree

# Same drawing for the tree trained on the standardised features.
sns.reset_orig()
fig, ax = plt.subplots(figsize=(20, 10))
_ = plot_tree(
    tree_scaled,
    feature_names=feature_names,
    fontsize=16,
    ax=ax,
)