# code adapted from https://github.com/thomasjpfan/ml-workshop-intro
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


# Location of the stroke training data (a CSV file served from GitHub).
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where it raises TypeError. `on_bad_lines='warn'` (pandas >= 1.3) keeps the
# same behaviour: malformed rows are dropped and a warning is emitted.
df = pd.read_csv(url, on_bad_lines='warn')
# Bare expression: displays the frame when run as a notebook cell.
df

# Discard every row that contains at least one missing value.
df = df.dropna()

# Notebook-style check: per-column count of nulls left after the drop.
df.isnull().sum()

df.columns

# Set scikit-learn's estimator display mode to 'diagram'.
sklearn.set_config(display='diagram')

# The target is the 'stroke' column; the features are every other column
# except 'id'.
y = df['stroke']
X = df.drop(columns=['stroke', 'id'])


# Keep only the numeric feature columns.
X = X.select_dtypes(include='number')
X

# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

# Make pipeline!

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Baseline model: standardize the features, then fit a k-nearest-neighbours
# regressor with default settings.
# NOTE(review): 'stroke' looks like a class label; a classifier may be more
# appropriate than a regressor here, which could explain the negative score
# recorded below -- confirm against the dataset.
knr = make_pipeline(
    StandardScaler(), KNeighborsRegressor()
)
knr.fit(X_train, y_train)
# Notebook output (repr of the fitted pipeline). Kept as a comment: as bare
# code it raised NameError because `Pipeline` is never imported.
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('kneighborsregressor', KNeighborsRegressor())])
# StandardScaler()
# KNeighborsRegressor()

# Score on the held-out split (R^2 for a regressor pipeline).
knr.score(X_test, y_test)
# Notebook output: -0.16833369709565682
from sklearn.preprocessing import PolynomialFeatures

# Same regressor as the baseline, with a polynomial feature expansion
# (default settings) inserted between scaling and the estimator.
knr_select = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(),
    KNeighborsRegressor()
)
knr_select.fit(X_train, y_train)
# Notebook output (repr of the fitted pipeline). Kept as a comment: as bare
# code it raised NameError because `Pipeline` is never imported.
# Pipeline(steps=[('standardscaler', StandardScaler()),
#                 ('polynomialfeatures', PolynomialFeatures()),
#                 ('kneighborsregressor', KNeighborsRegressor())])
# StandardScaler()
# PolynomialFeatures()
# KNeighborsRegressor()

# Score on the held-out split (R^2 for a regressor pipeline).
knr_select.score(X_test, y_test)
# Notebook output: -0.1635984889204516
from sklearn.datasets import load_breast_cancer

# Second example: the breast-cancer dataset bundled with scikit-learn,
# requested as pandas objects via `as_frame=True`.
cancer = load_breast_cancer(as_frame=True)

# Rebind X / y (previously the stroke data) to the new features and target.
X = cancer.data
y = cancer.target

# Notebook-style check of how many samples fall in each target class.
y.value_counts()

# Split stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

from sklearn.linear_model import LogisticRegression

# Baseline classifier: standardize the features, then fit a logistic
# regression with a fixed random_state.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

log_reg.fit(X_train, y_train)

# Score on the held-out split (mean accuracy for a classifier pipeline).
log_reg.score(X_test, y_test)

# Same classifier as the baseline, with a polynomial feature expansion
# (default settings) between scaling and the logistic regression.
log_reg_poly = make_pipeline(
    StandardScaler(), PolynomialFeatures(), LogisticRegression(random_state=0)
)

log_reg_poly.fit(X_train, y_train)

# Score on the held-out split (mean accuracy for a classifier pipeline).
log_reg_poly.score(X_test, y_test)
# Notebook output: 0.965034965034965