# Model Evaluation
# code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 20
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['lines.linewidth'] = 2.5
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams["savefig.dpi"] = 300
sklearn.set_config(display='diagram')
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines was removed in pandas 2.0
df
df = df.dropna()
df.isnull().sum()  # confirm no missing values remain
df.columns
X, y = df.drop(['stroke', 'id'], axis=1), df['stroke']
X = X.select_dtypes(include='number')
X
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X.head()
y.value_counts()
y = (y == 1).astype('int')
y.value_counts()
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)
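The target is heavily imbalanced, so `stratify=y` keeps the class proportions consistent between the splits; a quick added check:
# Class balance should be (nearly) identical in both splits.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))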
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(random_state=42))])
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred
log_reg.score(X_test, y_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
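Accuracy alone is hard to interpret on an imbalanced target; a small added comparison against the majority-class baseline makes the point:
# With a rare positive class, always predicting the majority class
# already scores high accuracy; the report's per-class recall is
# far more informative here.
majority_acc = y_test.value_counts(normalize=True).max()
print(f"majority-class baseline accuracy: {majority_acc:.3f}")
print(f"logistic regression accuracy:     {log_reg.score(X_test, y_test):.3f}")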
# %load solutions/01-ex01-solutions.py
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))
## Using probabilities
y_proba = log_reg.predict_proba(X_test)
y_proba[65:70]
y_pred[65:70]
y_pred_50 = y_proba[:, 1] > 0.5
print(classification_report(y_test, y_pred_50))
y_pred_25 = y_proba[:, 1] > 0.25
print(classification_report(y_test, y_pred_25))
y_pred_75 = y_proba[:, 1] > 0.75
print(classification_report(y_test, y_pred_75))
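Raising the threshold trades recall for precision. A compact way to see the trade-off (a small added sketch using `precision_score` and `recall_score`, assuming `y_proba` from above):
from sklearn.metrics import precision_score, recall_score
for threshold in [0.25, 0.5, 0.75]:
    pred = y_proba[:, 1] > threshold
    print(f"threshold={threshold:.2f}  "
          f"precision={precision_score(y_test, pred, zero_division=0):.3f}  "
          f"recall={recall_score(y_test, pred, zero_division=0):.3f}")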
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(log_reg, X_test, y_test, name="LogisticRegression")
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(log_reg, X_test, y_test, name="LogisticRegression")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
PrecisionRecallDisplay.from_estimator(log_reg, X_test, y_test, name="LogisticRegression", ax=ax1)
RocCurveDisplay.from_estimator(log_reg, X_test, y_test, name="LogisticRegression", ax=ax2)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
fig, ax = plt.subplots()
PrecisionRecallDisplay.from_estimator(log_reg, X_test, y_test, ax=ax, name="Logistic Regression")
PrecisionRecallDisplay.from_estimator(rf, X_test, y_test, ax=ax, name="Random Forest")
## Exercise 2
- Plot the ROC curve of the logistic regression model and the random forest model on the same axes.
- Train a `sklearn.dummy.DummyClassifier(strategy='prior')` on the training dataset and plot the precision-recall curve and the ROC curve with the test dataset.
- What is the ROC AUC and the average precision for the dummy classifier?
- Extra: Compute the F1 score for the three models we have trained so far. Which model performs the best according to the F1 score? Hint: `f1_score` is in `sklearn.metrics`. (One possible sketch follows below.)
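One possible sketch for this exercise (not the packaged solution file; it assumes the `log_reg` and `rf` models fitted above):
from sklearn.dummy import DummyClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, f1_score
dummy = DummyClassifier(strategy='prior').fit(X_train, y_train)
models = [(log_reg, "Logistic Regression"), (rf, "Random Forest"),
          (dummy, "Dummy (prior)")]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
for model, name in models:
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax1)
    PrecisionRecallDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax2)
# F1 uses hard predictions; the dummy never predicts the minority class.
for model, name in models:
    print(f"{name}: f1 = {f1_score(y_test, model.predict(X_test), zero_division=0):.3f}")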
# For a linear model, decision_function returns the signed distance to
# the decision boundary; a positive score corresponds to predicting 1.
log_reg_decision = log_reg.decision_function(X_test)
np.all((log_reg_decision > 0) == log_reg.predict(X_test))
log_reg_pred = log_reg.predict_proba(X_test)
log_reg_pred
1/(1 + np.exp(-log_reg_decision))
log_reg_pred[:, 1]
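Applying the logistic sigmoid to the decision scores reproduces the positive-class column of `predict_proba`; a small added check:
np.allclose(1 / (1 + np.exp(-log_reg_decision)), log_reg_pred[:, 1])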
from sklearn.metrics import average_precision_score
# Average precision is rank-based and the sigmoid is monotonic, so the
# decision scores and the probabilities give identical results.
average_precision_score(y_test, log_reg_decision)
average_precision_score(y_test, log_reg_pred[:, 1])
# %load solutions/01-ex03-solutions.py
from sklearn.metrics import roc_auc_score
rf_proba = rf.predict_proba(X_test)
roc_auc_score(y_test, rf_proba[:, 1])
from sklearn.svm import SVC
svc = SVC(random_state=0)
svc.fit(X_train, y_train)
# SVC has no predict_proba by default, but ROC AUC only needs a
# ranking, so the decision_function scores are enough.
svc_decision = svc.decision_function(X_test)
roc_auc_score(y_test, svc_decision)
## Reading the dataset using pandas
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/CTG.csv'
df = pd.read_csv(url, on_bad_lines='skip')
df
df = df.dropna()
df.isnull().sum()
X, y = df.drop(['NSP', 'FileName', 'Date', 'SegFile'], axis=1), df['NSP']
X
_ = X.hist(figsize=(30, 15), layout=(5, 8))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, cmap='gray_r')
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
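The report's `macro avg` row is the unweighted mean of the per-class scores; as an added cross-check, `f1_score` computes it directly:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='macro')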
from sklearn.metrics import roc_auc_score
rf_y_pred_proba = rf.predict_proba(X_test)
roc_auc_score(y_test, rf_y_pred_proba, multi_class='ovo')
roc_auc_score(y_test, rf_y_pred_proba, multi_class='ovr')
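As an added sanity check on what `multi_class='ovr'` computes: binarize the labels and macro-average the per-class AUCs; with the default `average='macro'`, this should reproduce the value above.
from sklearn.preprocessing import label_binarize
classes = np.unique(y_test)
y_bin = label_binarize(y_test, classes=classes)
# One-vs-rest AUC per class, then an unweighted (macro) average.
per_class_auc = [roc_auc_score(y_bin[:, i], rf_y_pred_proba[:, i])
                 for i in range(len(classes))]
np.mean(per_class_auc)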
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
df = pd.read_csv(url, on_bad_lines='skip')
df
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
df.columns
df = df.dropna()
df.isnull().sum()
X, y = df.drop(['specific', 'Unnamed: 0'], axis=1), df['specific']
X = X.select_dtypes(include='number')
X
_ = X.hist(figsize=(30, 15), layout=(5, 8))
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)
from sklearn.pipeline import make_pipeline
ridge = make_pipeline(StandardScaler(), Ridge())
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)  # R^2 by default for regressors
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
ridge_pred = ridge.predict(X_test)
ridge_pred[:10]
ridge_r2 = r2_score(y_test, ridge_pred)
ridge_r2
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_mse
ridge_mae = mean_absolute_error(y_test, ridge_pred)
ridge_mae
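As an added cross-check, the same three metrics computed by hand with numpy:
errors = y_test - ridge_pred
print("R^2 :", 1 - np.sum(errors**2) / np.sum((y_test - y_test.mean())**2))
print("MSE :", np.mean(errors**2))
print("MAE :", np.mean(np.abs(errors)))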
fig, ax = plt.subplots()
delta = y_test - ridge_pred
ax.plot(ridge_pred, delta, 'o', alpha=0.5)
ax.axhline(y=0, c='k', ls='--')
ax.set(xlabel='predicted', ylabel='y_true - predicted', aspect='equal');
fig, ax = plt.subplots()
ax.hist(delta, bins=30)
ax.set(xlabel="y_true - predicted", ylabel="Counts");
# %load solutions/01-ex04-solutions.py
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
r2_score(y_test, rf_pred)
mean_squared_error(y_test, rf_pred)
mean_absolute_error(y_test, rf_pred)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)
ridge = make_pipeline(StandardScaler(), Ridge(random_state=42))
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
X_test.head()
X_analysis = X_test.assign(
    delta=y_test - ridge_pred
)
import seaborn as sns
import matplotlib.pyplot as plt
columns = X_analysis.columns
n_features = X.shape[1]
fig, axes = plt.subplots(3, 5, figsize=(20, 10), constrained_layout=True)
for i, ax in enumerate(axes.ravel()):
    if i >= n_features:
        ax.set_visible(False)
        continue
    sns.scatterplot(x=columns[i], y='delta', ax=ax, data=X_analysis)
    ax.axhline(y=0, c='k', ls='--')