# Cross-Validation in scikit-learn example
#code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
import seaborn as sns
import sklearn

# Notebook-wide plotting defaults.
sns.set_theme(context="notebook", font_scale=1.2,
              rc={"figure.figsize": [10, 6]})
# Render sklearn estimators as HTML diagrams in notebook output.
sklearn.set_config(display="diagram")

import pandas as pd

# First (exploratory) dataset: provincial panel data.
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is the drop-in replacement (skip malformed rows).
df = pd.read_csv(url, on_bad_lines="skip")
df
# Notebook-style re-imports; harmless no-ops if already imported above.
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Drop rows with missing values, then confirm nothing is left.
df = df.dropna()
df.isnull().sum()
df.columns
sklearn.set_config(display='diagram')

# Target is `it`; also drop the stray exported index column, then keep
# only the numeric feature columns.
X, y = df.drop(columns=['it', 'Unnamed: 0']), df['it']
X = X.select_dtypes(include='number')
X

# Quick visual sanity check of the feature distributions.
_ = X.hist(figsize=(30, 15), layout=(5, 8))
# Main dataset: predict the binary `stroke` outcome.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines="skip"`
# keeps the old skip-malformed-rows behavior.
df = pd.read_csv(url, on_bad_lines="skip")
df
df = df.dropna()
df.isnull().sum()
df.columns
sklearn.set_config(display='diagram')
# `id` is only a row identifier; `stroke` is the target.
X, y = df.drop(['stroke', 'id'], axis=1), df['stroke']
X = X.select_dtypes(include='number')
X
# Stratify so both splits preserve the (imbalanced) class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

# Baseline that always predicts the majority class ("prior" strategy);
# any real model should beat its cross-validated accuracy.
dummy_clf = DummyClassifier(strategy="prior")
dummy_scores = cross_val_score(dummy_clf, X_train, y_train)
dummy_scores
dummy_scores.mean()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN is distance based, so standardize features inside the pipeline
# (the scaler is re-fit on each CV training fold — no leakage).
knc = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)
knc_scores = cross_val_score(knc, X_train, y_train)
knc_scores
knc_scores.mean()
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardizing helps the solver converge; random_state pins the seed.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
log_reg_scores = cross_val_score(log_reg, X_train, y_train)
log_reg_scores
log_reg_scores.mean()
# The target is imbalanced (see the counts), so plain accuracy is
# misleading — re-score both models with ROC AUC instead.
y.value_counts()
auc = "roc_auc"
dummy_scores = cross_val_score(dummy_clf, X_train, y_train, scoring=auc)
dummy_scores.mean()
knc_scores = cross_val_score(knc, X_train, y_train, scoring=auc)
knc_scores.mean()
from sklearn.model_selection import KFold, RepeatedKFold

# Plain (unshuffled, unstratified) 4-fold CV...
cross_val_score(log_reg, X_train, y_train, cv=KFold(n_splits=4))

# ...and the same scheme repeated twice, yielding 4 * 2 = 8 scores.
scores = cross_val_score(
    log_reg, X_train, y_train,
    cv=RepeatedKFold(n_splits=4, n_repeats=2),
)
scores
scores.shape
from sklearn.model_selection import StratifiedKFold
# Stratified folds preserve the class ratio of y within every fold.
scores = cross_val_score(log_reg, X_train, y_train,
                         cv=StratifiedKFold(n_splits=4))
scores
# This is a binary classification problem:
y.value_counts()
# Scikit-learn uses StratifiedKFold by default for classifiers
# when cv is given as an integer:
cross_val_score(log_reg, X_train, y_train, cv=4)
from sklearn.model_selection import RepeatedStratifiedKFold
# Repeat the stratified scheme 3 times -> 4 * 3 = 12 scores.
scores = cross_val_score(
    log_reg, X_train, y_train,
    cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=3))
scores
scores.shape
# %load solutions/01-ex02-solutions.py
from sklearn.model_selection import cross_validate
import pandas as pd

# cross_validate also records fit/score times and, unlike
# cross_val_score, supports several metrics in one pass.
results = cross_validate(log_reg, X_train, y_train, cv=4)
results
pd.DataFrame(results)

more_results = cross_validate(
    log_reg, X_train, y_train, cv=4,
    scoring=["f1", "accuracy"],
)
pd.DataFrame(more_results)
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# NOTE(review): this rebinds X (previously the stroke feature frame)
# to a toy array for the splitter demo.
X = np.arange(10)
# Expanding-window splits: each test fold comes strictly after its
# training samples, so there is no look-ahead leakage.
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
# With gap=2 the splitter leaves 2 samples between each training set
# and its test set, e.g.:
# tscv = TimeSeriesSplit(n_splits=3, gap=2)