# sklearn-pipelines-example
# code adapted from https://github.com/thomasjpfan/ml-workshop-intro
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Load the stroke training data directly from GitHub.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where
# passing it raises a TypeError. `on_bad_lines='warn'` is the replacement
# that keeps the previous behaviour: malformed rows are dropped and a warning
# is emitted for each of them (requires pandas >= 1.3).
df = pd.read_csv(url, on_bad_lines='warn')
# Bare expression: displays the frame in a notebook cell, no effect in a script.
df
# Discard every row that has a missing value in any column.
df = df.dropna()
# Per-column count of remaining missing values.
df.isnull().sum()
df.columns
# Ask scikit-learn to render estimators as diagrams in notebook front ends.
sklearn.set_config(display='diagram')
# `stroke` is the prediction target; `id` is removed from the features
# together with it.
X = df.drop(columns=['stroke', 'id'])
y = df['stroke']
# Keep only the numeric columns; non-numeric columns are not used as features.
X = X.select_dtypes(include='number')
X
# No test_size is given, so scikit-learn's default split proportion applies;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
# Baseline model: standardize the features, then fit a k-nearest-neighbours
# regressor with its default settings.
# NOTE(review): the target here is the `stroke` column; if that is a binary
# label, a classifier may be a better fit than a regressor — confirm.
knr = make_pipeline(StandardScaler(), KNeighborsRegressor())
# `fit` returns the fitted pipeline, so scoring on the held-out split can be
# chained onto it. The returned score is not stored.
knr.fit(X_train, y_train).score(X_test, y_test)
from sklearn.preprocessing import PolynomialFeatures
# Same k-nearest-neighbours regressor, but with a polynomial feature
# expansion (default settings) inserted between scaling and the model.
knr_select = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(),
    KNeighborsRegressor(),
)
# Fit on the training split and score on the held-out split in one chained
# call; the returned score is not stored.
knr_select.fit(X_train, y_train).score(X_test, y_test)
from sklearn.datasets import load_breast_cancer
# Switch to the breast-cancer dataset, loaded as pandas objects.
# Note that X, y and the train/test names below are rebound, replacing the
# values assigned earlier in the file.
cancer = load_breast_cancer(as_frame=True)
X = cancer.data
y = cancer.target
# Class counts of the target (displayed in a notebook, no effect in a script).
y.value_counts()
# Stratify on the target so each split is drawn with respect to the class
# labels; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)
from sklearn.linear_model import LogisticRegression
# Classification baseline: standardize the features, then fit a logistic
# regression with a fixed seed.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
# `fit` returns the fitted pipeline, so scoring on the held-out split can be
# chained onto it. The returned score is not stored.
log_reg.fit(X_train, y_train).score(X_test, y_test)
# Logistic regression again, this time with a polynomial feature expansion
# (default settings) between the scaler and the classifier.
log_reg_poly = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(),
    LogisticRegression(random_state=0),
)
# Fit on the training split and score on the held-out split in one chained
# call; the returned score is not stored.
log_reg_poly.fit(X_train, y_train).score(X_test, y_test)