# Review of scikit-learn
# Code adapted from https://github.com/thomasjpfan/ml-workshop-intro
import seaborn as sns
import sklearn
# Session-wide plotting defaults: seaborn "notebook" context, fonts scaled
# by 1.2, and a default figure size of [10, 6].
plot_rc = {"figure.figsize": [10, 6]}
sns.set_theme(context="notebook", font_scale=1.2, rc=plot_rc)

# Have scikit-learn display estimators as diagrams.
sklearn.set_config(display="diagram")
import pandas as pd

# Panel dataset, downloaded from GitHub on every run.
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where passing it raises a TypeError. `on_bad_lines="warn"` (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are skipped and a warning is
# emitted for each of them.
df = pd.read_csv(url, on_bad_lines="warn")
df
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Drop rows containing missing values, then inspect the remaining null
# counts and the column names.
df = df.dropna()
df.isna().sum()
df.columns

sklearn.set_config(display='diagram')

# Features: the numeric columns left after removing 'it' and 'Unnamed: 0'.
# The 'it' column is kept separately as y.
X = df.drop(columns=['it', 'Unnamed: 0']).select_dtypes(include='number')
y = df['it']
X

# One histogram per feature column, arranged with layout=(5, 8).
_ = X.hist(figsize=(30, 15), layout=(5, 8))
# Stroke training dataset, downloaded from GitHub on every run.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where passing it raises a TypeError. `on_bad_lines="warn"` (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are skipped and a warning is
# emitted for each of them.
df = pd.read_csv(url, on_bad_lines="warn")
df
# Drop rows containing missing values, then inspect the remaining null
# counts and the column names.
df = df.dropna()
df.isna().sum()
df.columns

sklearn.set_config(display='diagram')

# Features: the numeric columns left after removing 'stroke' and 'id'.
# The 'stroke' column is the label vector y.
X = df.drop(columns=['stroke', 'id']).select_dtypes(include='number')
y = df['stroke']
X

# Stratify on y so both partitions keep the same label proportions; the
# fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
from sklearn.dummy import DummyClassifier

# Baseline classifier using the 'prior' strategy; its held-out accuracy is
# the reference point for the models fitted afterwards.
baseline = DummyClassifier(strategy='prior')
dc = baseline.fit(X_train, y_train)
dc.score(X_test, y_test)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

# Two-step pipeline: standardize the features, then classify with
# k-nearest neighbours (both steps use their default settings).
knn_steps = (StandardScaler(), KNeighborsClassifier())
knc = make_pipeline(*knn_steps)

# Fit on the training partition and report accuracy on the held-out one.
knc.fit(X_train, y_train)
knc.score(X_test, y_test)
# %load solutions/00-ex01-solutions.py
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset as pandas objects and look at how many
# samples fall into each target class.
cancer = load_breast_cancer(as_frame=True)
X = cancer.data
y = cancer.target
y.value_counts()

# Stratified, seeded split; this rebinds the train/test names used earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: standardize the features, then fit a logistic
# regression (both steps use their default settings).
logreg_steps = (StandardScaler(), LogisticRegression())
log_reg = make_pipeline(*logreg_steps)

# Fit on the training partition and report accuracy on the held-out one.
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)
from sklearn.metrics import f1_score

# Predict labels for the held-out samples with the fitted pipeline, then
# compute the F1 score of those predictions against the true labels.
y_pred = log_reg.predict(X_test)
y_pred
f1_score(y_true=y_test, y_pred=y_pred)