Imbalanced data
#code adapted from https://github.com/thomasjpfan/ml-workshop-advanced
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = [12, 8]
sklearn.set_config(display='diagram')
# %load solutions/classifier_example.py
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
df = pd.read_csv(url, error_bad_lines=False)
df
df=df.dropna()
df.isnull().sum()
df.columns
sklearn.set_config(display='diagram')
X, y = df.drop(['stroke', 'id'], axis = 1), df['stroke']
X = X.select_dtypes(include='number')
X
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
from sklearn.datasets import fetch_openml
np.bincount(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, stratify=y, random_state=0)
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
base_log_reg = LogisticRegression(random_state=42)
cv_results = cross_validate(base_log_reg,
X_train, y_train, scoring=['roc_auc', 'average_precision'])
cv_results
log_reg_base_auc = cv_results['test_roc_auc'].mean()
log_reg_base_auc
log_reg_base_ap = cv_results['test_average_precision'].mean()
log_reg_base_ap
def compute_metrics(estimator):
cv_results = cross_validate(estimator,
X_train, y_train, scoring=['roc_auc', 'average_precision'])
return {
"auc": cv_results["test_roc_auc"].mean(),
"average_precision": cv_results["test_average_precision"].mean(),
}
base_log_reg_metrics = compute_metrics(base_log_reg)
base_log_reg_metrics
from sklearn.ensemble import RandomForestClassifier
base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)
base_rf_metrics = compute_metrics(base_rf)
base_rf_metrics
np.bincount(y_train)
from imblearn.under_sampling import RandomUnderSampler
under_sampler = RandomUnderSampler(random_state=42)
X_train_subsample, y_train_subsample = under_sampler.fit_resample(X_train, y_train)
X_train.shape
X_train_subsample.shape
np.bincount(y_train_subsample)
from imblearn.over_sampling import RandomOverSampler
over_sampler = RandomOverSampler(random_state=42)
X_train_subsample, y_train_subsample = over_sampler.fit_resample(X_train, y_train)
X_train_subsample.shape
np.bincount(y_train_subsample)
from imblearn.pipeline import make_pipeline as make_imb_pipeline
under_log_reg = make_imb_pipeline(
RandomUnderSampler(random_state=42), LogisticRegression(random_state=42))
base_log_reg_metrics
compute_metrics(under_log_reg)
under_rf = make_imb_pipeline(
RandomUnderSampler(random_state=42), RandomForestClassifier(random_state=42))
base_rf_metrics
compute_metrics(under_rf)
over_log_reg = make_imb_pipeline(
RandomOverSampler(), LogisticRegression(random_state=42))
base_log_reg_metrics
compute_metrics(over_log_reg)
# %load solutions/02-ex01-solutions.py
over_rf = make_imb_pipeline(
RandomOverSampler(),
RandomForestClassifier(random_state=42, n_jobs=-1)
)
base_rf_metrics
compute_metrics(over_rf)
base_log_reg.fit(X_train, y_train)
under_log_reg.fit(X_train, y_train)
over_log_reg.fit(X_train, y_train);
base_log_reg.score(X_test, y_test)
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_log_reg, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_log_reg, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_log_reg, X_test, y_test, ax=ax1, name="oversampling")
plot_precision_recall_curve(base_log_reg, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_log_reg, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_log_reg, X_test, y_test, ax=ax2, name="oversampling");
# %load solutions/02-ex02-solutions.py
base_rf.fit(X_train, y_train)
under_rf.fit(X_train, y_train)
over_rf.fit(X_train, y_train);
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_rf, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_rf, X_test, y_test, ax=ax1, name="oversampling")
plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_rf, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_rf, X_test, y_test, ax=ax2, name="oversampling");
class_weight_log_reg = LogisticRegression(class_weight='balanced', random_state=42)
class_weight_log_reg.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_log_reg, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(class_weight_log_reg, X_test, y_test, ax=ax1, name="class-weighted")
plot_precision_recall_curve(base_log_reg, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(class_weight_log_reg, X_test, y_test, ax=ax2, name="class-weighted")
class_weight_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
class_weight_rf.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(class_weight_rf, X_test, y_test, ax=ax1, name="class-weighted")
plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(class_weight_rf, X_test, y_test, ax=ax2, name="class-weighted")
from imblearn.ensemble import BalancedRandomForestClassifier
balanced_rf = BalancedRandomForestClassifier(random_state=0)
balanced_rf.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_rf, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_rf, X_test, y_test, ax=ax1, name="oversampling")
plot_roc_curve(balanced_rf, X_test, y_test, ax=ax1, name="balanced bagging")
plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_rf, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_rf, X_test, y_test, ax=ax2, name="oversampling");
plot_precision_recall_curve(balanced_rf, X_test, y_test, ax=ax2, name="balanced bagging")
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train_smote.shape
np.bincount(y_train_smote)
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
sorting = np.argsort(y_train)
axes[0].set_title("Original")
axes[0].scatter(X_train.iloc[sorting, 3], X_train.iloc[sorting, 4], c=plt.cm.tab10(y_train.iloc[sorting]), alpha=.3, s=2)
axes[1].set_title("SMOTE")
axes[1].scatter(X_train_smote.iloc[:, 3], X_train_smote.iloc[:, 4], c=plt.cm.tab10(y_train_smote), alpha=.1, s=2)
cv_results
smote_log_reg = make_imb_pipeline(
SMOTE(random_state=42), LogisticRegression(random_state=42))
compute_metrics(smote_log_reg)
base_rf_metrics
smote_rf = make_imb_pipeline(SMOTE(random_state=42), RandomForestClassifier(random_state=42, n_jobs=-1))
compute_metrics(smote_rf)
smote_rf.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_rf, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_rf, X_test, y_test, ax=ax1, name="oversampling")
plot_roc_curve(balanced_rf, X_test, y_test, ax=ax1, name="balanced bagging")
plot_roc_curve(smote_rf, X_test, y_test, ax=ax1, name="smote")
plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_rf, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_rf, X_test, y_test, ax=ax2, name="oversampling");
plot_precision_recall_curve(balanced_rf, X_test, y_test, ax=ax2, name="balanced bagging")
plot_precision_recall_curve(smote_rf, X_test, y_test, ax=ax2, name="smote")
# %load solutions/02-ex03-solutions.py
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
base_hist = HistGradientBoostingClassifier(random_state=42)
base_hist.fit(X_train, y_train)
smote_hist = make_imb_pipeline(
SMOTE(), HistGradientBoostingClassifier(random_state=42))
smote_hist.fit(X_train, y_train)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_hist, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(smote_hist, X_test, y_test, ax=ax1, name="smote")
plot_precision_recall_curve(base_hist, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(smote_hist, X_test, y_test, ax=ax2, name="smote")