Model Inspection
# code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
sns.set_theme(font_scale=1.2)
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams["savefig.dpi"] = 300
sklearn.set_config(display='diagram')
# %load solutions/regression_example.py
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines was removed in pandas 2.0
df
import sklearn
from sklearn.model_selection import train_test_split
df.columns
df = df.dropna()
df.isnull().sum()
X, y = df.drop(['specific', 'Unnamed: 0'], axis=1), df['specific']
X = X.select_dtypes(include='number')
X
_ = X.hist(figsize=(30, 15), layout=(5, 8))
X.head()
y.head()
import numpy as np
# add a running index column: it carries no real signal, so it serves as a
# control for the feature-importance measures below
X = X.assign(ran_num=np.arange(0, X.shape[0]))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
ridge = Pipeline([
    ('scale', StandardScaler()),
    ('reg', Ridge())
])
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train)
ridge.score(X_test, y_test)
import pandas as pd
import matplotlib.pyplot as plt
def plot_linear_coef(coefs, names, ax=None, sort=False):
    if ax is None:
        fig, ax = plt.subplots()
    coefs = pd.DataFrame(
        coefs, columns=['Coefficients'],
        index=names
    )
    if sort:
        coefs = coefs.sort_values(by='Coefficients')
    coefs.plot(kind='barh', ax=ax)
    ax.axvline(x=0, color='.5')
    return ax
plot_linear_coef(ridge['reg'].coef_, names=X_train.columns, sort=True);
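The coefficients above are on the standardized scale. To read them in the original feature units, divide by the per-feature standard deviations the scaler learned; a minimal sketch (raw_coefs is an illustrative name, not part of the workshop code):
# undo the StandardScaler: coefficient on scaled data / feature std = raw-unit coefficient
raw_coefs = ridge['reg'].coef_ / ridge['scale'].scale_
pd.Series(raw_coefs, index=X_train.columns)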
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
ridges_cv = cross_validate(
    ridge, X_train, y_train, cv=RepeatedKFold(n_splits=5, n_repeats=5),
    return_estimator=True)
ridges_cv
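Before inspecting the fitted coefficients, it helps to summarize the cross-validated scores themselves; a quick sketch using the test_score array that cross_validate returns:
# spread of the 25 repeated-CV scores (5 splits x 5 repeats)
pd.Series(ridges_cv['test_score']).describe()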
ridge_coefs = pd.DataFrame(
    [model['reg'].coef_ for model in ridges_cv['estimator']],
    columns=X.columns
)
ridge_coefs.head()
fig, ax = plt.subplots()
_ = ax.boxplot(ridge_coefs, vert=False, labels=ridge_coefs.columns)
from sklearn.linear_model import Lasso
lasso = Pipeline([
    ('scale', StandardScaler()),
    ('reg', Lasso(alpha=0.06))
])
lasso.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 9))
plot_linear_coef(lasso['reg'].coef_, names=X_train.columns, sort=True, ax=ax1);
plot_linear_coef(ridge['reg'].coef_, names=X_train.columns, sort=True, ax=ax2);
lasso_cvs = cross_validate(
    lasso, X_train, y_train, return_estimator=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=5)
)
lasso_coefs = pd.DataFrame(
    [model['reg'].coef_ for model in lasso_cvs['estimator']],
    columns=X.columns
)
fig, ax = plt.subplots()
_ = ax.boxplot(lasso_coefs, vert=False, labels=lasso_coefs.columns)
# %load solutions/03-ex01-solutions.py
from sklearn.linear_model import Lasso
lasso = Pipeline([
    ('scale', StandardScaler()),
    ('reg', Lasso(random_state=42, alpha=0.04))
])
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
plot_linear_coef(ridge['reg'].coef_, X_train.columns, ax=ax1)
plot_linear_coef(lasso['reg'].coef_, X_train.columns, ax=ax2)
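The L1 penalty drives some coefficients exactly to zero, so lasso doubles as a feature selector; a quick sketch listing the features that survive at this alpha:
# features with nonzero lasso coefficients
X_train.columns[lasso['reg'].coef_ != 0].tolist()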
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)
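A near-perfect training score next to a noticeably lower test score is typical of an unconstrained forest. As a sketch, the out-of-bag estimate (rf_oob is an illustrative name) gives a generalization check without touching the test set:
# oob_score_ is computed from the trees that did not see each training row
rf_oob = RandomForestRegressor(random_state=42, oob_score=True)
rf_oob.fit(X_train, y_train)
rf_oob.oob_score_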
def plot_importances(importances, names, ax=None):
    if ax is None:
        fig, ax = plt.subplots()
    indices = np.argsort(importances)
    ax.barh(range(len(importances)), importances[indices])
    ax.set(yticks=range(len(importances)),
           yticklabels=np.array(names)[indices])
    return ax
importances = rf.feature_importances_
plot_importances(importances, X_train.columns);
Pay attention to ran_num! The impurity-based importances give this meaningless index column real weight, because the measure is biased toward features with many unique values.
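Permutation importance avoids that bias by measuring how much the test score drops when a single column is shuffled. A hand-rolled sketch of the idea (manual_perm_importance is an illustrative helper; sklearn's permutation_importance below does this properly for all columns at once):
def manual_perm_importance(model, X, y, column, n_repeats=10, seed=0):
    # average score drop after shuffling one column
    rng = np.random.default_rng(seed)
    baseline = model.score(X, y)
    drops = []
    for _ in range(n_repeats):
        X_perm = X.copy()
        X_perm[column] = rng.permutation(X_perm[column].to_numpy())
        drops.append(baseline - model.score(X_perm, y))
    return np.mean(drops)

manual_perm_importance(rf, X_test, y_test, 'ran_num')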
from sklearn.inspection import permutation_importance
rf_perm_results = permutation_importance(rf, X_test, y_test,
                                         n_repeats=10, n_jobs=-1)
def plot_permutation_importance(perm_results, names, ax=None):
    perm_sorted_idx = perm_results.importances_mean.argsort()
    if ax is None:
        fig, ax = plt.subplots()
    _ = ax.boxplot(perm_results.importances[perm_sorted_idx].T, vert=False,
                   labels=np.array(names)[perm_sorted_idx])
    return ax
_ = plot_permutation_importance(rf_perm_results, X_test.columns)
# %load solutions/classifier_example.py
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines was removed in pandas 2.0
df
df = df.dropna()
df.isnull().sum()
df.columns
X, y = df.drop(['stroke', 'id'], axis=1), df['stroke']
X = X.select_dtypes(include='number')
X
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
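Stroke labels are heavily imbalanced, so accuracy alone is misleading here; a quick sketch checking the class balance and a threshold-free metric:
from sklearn.metrics import roc_auc_score
print(y_train.value_counts(normalize=True))
print(roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))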
from sklearn.inspection import permutation_importance
rf_result = permutation_importance(rf, X_train, y_train,
                                   n_repeats=10, n_jobs=-1)
_ = plot_permutation_importance(rf_result, X_train.columns)
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
corr = spearmanr(X_train).correlation
# hierarchy.ward expects distances, not correlations: convert first
distance_matrix = 1 - np.abs(corr)
corr_linkage = hierarchy.ward(squareform(distance_matrix))
corr_linkage
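The linkage matrix is easier to read as a dendrogram; a quick sketch of the resulting feature clusters:
fig, ax = plt.subplots(figsize=(12, 6))
_ = hierarchy.dendrogram(corr_linkage, labels=list(X_train.columns),
                         ax=ax, leaf_rotation=90)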
from collections import defaultdict
cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
selected_features
from sklearn.inspection import permutation_importance
# keep one feature per cluster, retrain the forest, and re-inspect
X_train_sel = X_train.iloc[:, selected_features]
X_test_sel = X_test.iloc[:, selected_features]
rf_sel = RandomForestClassifier(random_state=42)
rf_sel.fit(X_train_sel, y_train)
rf_sel_result = permutation_importance(
    rf_sel, X_test_sel, y_test, n_repeats=10, n_jobs=-1)
features_sel = X_train.columns[selected_features]
_ = plot_permutation_importance(rf_sel_result, features_sel)
# the experimental import is no longer needed: HistGradientBoostingClassifier
# has been stable since scikit-learn 1.0
from sklearn.ensemble import HistGradientBoostingClassifier
hist = HistGradientBoostingClassifier(random_state=0)
hist.fit(X_train, y_train)
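For comparison with the forest above, a quick sketch evaluating the boosted model the same way:
print(classification_report(y_test, hist.predict(X_test)))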
# %load solutions/03-ex03-solutions.py
# load_boston was removed in scikit-learn 1.2; the California housing data
# serves as a drop-in replacement for this exercise
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
X, y = housing.data, housing.target
feature_names = housing.feature_names
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=0)
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
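The training R² alone says little about generalization; a quick check on the held-out split:
gb.score(X_test, y_test)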
plot_importances(gb.feature_importances_, feature_names)
gb_perm_results = permutation_importance(gb, X_test, y_test, n_repeats=10, n_jobs=-1)
plot_permutation_importance(gb_perm_results, feature_names)
# plot_partial_dependence was removed in scikit-learn 1.2; use PartialDependenceDisplay
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(
    gb, X_test, features=["MedInc", "AveOccup", "HouseAge", "AveRooms"],
    feature_names=feature_names, n_cols=2)
PartialDependenceDisplay.from_estimator(
    gb, X_test, features=[("MedInc", "AveOccup")], feature_names=feature_names)
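Partial dependence averages over the whole dataset and can hide heterogeneous effects; as a sketch, kind='both' overlays individual conditional expectation (ICE) curves on the average curve:
PartialDependenceDisplay.from_estimator(
    gb, X_test, features=["MedInc"], kind="both", feature_names=feature_names)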