Missing values in scikit-learn
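This notebook walks through handling missing values with scikit-learn: SimpleImputer and its strategies, missing-value indicator columns, estimators such as HistGradientBoostingClassifier that accept NaN directly, and comparing imputers (SimpleImputer, KNNImputer, IterativeImputer) inside a pipeline with GridSearchCV.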
# code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
from sklearn.impute import SimpleImputer
import numpy as np
import sklearn
sklearn.set_config(display='diagram')
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
df = pd.read_csv(url, on_bad_lines='skip')  # error_bad_lines was removed in pandas 2.0
df
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
df.columns
X, y = df.drop(['it', 'Unnamed: 0'], axis=1), df['it']
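# keep only numeric columns; the mean/median imputation strategies used below require numeric data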
X = X.select_dtypes(include='number')
X
_ = X.hist(figsize=(30, 15), layout=(5, 8))
df.isnull().sum()
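# SimpleImputer defaults to strategy='mean', replacing each NaN with its column's mean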
imputer = SimpleImputer()
imputer.fit_transform(X)
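# add_indicator=True appends a binary column per feature flagging which values were missing,
# so downstream models can use the missingness itself as a signal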
imputer = SimpleImputer(add_indicator=True)
imputer.fit_transform(X)
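# the median strategy is more robust to outliers and skewed distributions than the mean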
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)
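# most_frequent (mode) imputation also works for string/categorical features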
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)
# constant-fill imputation also works on string data: fill_value can be any value
# (small toy frame below for illustration, with one string and one numeric column)
df_demo = pd.DataFrame({'a': ['dog', 'cat', np.nan, 'dog'],
                        'b': [1.0, np.nan, 3.0, 4.0]})
imputer = SimpleImputer(strategy='constant', fill_value='sk_missing')
imputer.fit_transform(df_demo)
# SimpleImputer also accepts pandas categorical columns
df_demo['a'] = df_demo['a'].astype('category')
df_demo
df_demo.dtypes
imputer.fit_transform(df_demo)
# %load solutions/03-ex01-solutions.py
from sklearn.datasets import fetch_openml
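# data_id=15 on OpenML is the breast-w (Wisconsin breast cancer) dataset, which contains missing values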
cancer = fetch_openml(data_id=15, as_frame=True)
X, y = cancer.data, cancer.target
X.shape
X.isna().sum()
imputer = SimpleImputer(add_indicator=True)
X_imputed = imputer.fit_transform(X)
X_imputed.shape
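# the indicator columns make X_imputed wider than X; with a recent scikit-learn (>= 1.1)
# the output column names, including the missing-value indicators, can be inspected with:
imputer.get_feature_names_out()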
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)
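# stratify=y keeps the class proportions the same in the train and test splits

# pipeline: impute (adding indicator columns), scale, then fit a logistic regression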
log_reg = make_pipeline(
    SimpleImputer(add_indicator=True),
    StandardScaler(),
    LogisticRegression(random_state=0)
)
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)
# HistGradientBoostingClassifier handles missing values natively, so no imputation step is needed
# (it is stable since scikit-learn 1.0; the experimental enable_hist_gradient_boosting import is no longer required)
from sklearn.ensemble import HistGradientBoostingClassifier
hist = HistGradientBoostingClassifier(random_state=42)
hist.fit(X_train, y_train)
hist.score(X_test, y_test)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
iris = pd.read_csv('data/iris_w_missing.csv')
iris.head()
X = iris.drop('target', axis='columns')
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)
pipe = Pipeline([
    ('imputer', SimpleImputer(add_indicator=True)),
    ('rf', RandomForestClassifier(random_state=42))
])
pipe.get_params()
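# get_params() lists tunable parameters as '<step>__<parameter>' keys for use in a grid search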
from sklearn.model_selection import GridSearchCV
params = {
    'imputer__add_indicator': [True, False]
}
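# GridSearchCV performs 5-fold cross-validation by default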
grid_search = GridSearchCV(pipe, param_grid=params, verbose=1)
grid_search.fit(X_train, y_train)
grid_search.best_params_
grid_search.best_score_
grid_search.score(X_test, y_test)
from sklearn.pipeline import make_pipeline
pipe2 = make_pipeline(SimpleImputer(add_indicator=True),
                      RandomForestClassifier(random_state=42))
pipe2.get_params()
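# make_pipeline names steps after their classes ('simpleimputer', 'randomforestclassifier'),
# so grid-search parameter keys would use those names instead of 'imputer' and 'rf'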
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
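# IterativeImputer is still experimental and requires the enable_iterative_imputer import;
# the grid below swaps entire imputer objects in and out of the pipeline's 'imputer' step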
params = {
    'imputer': [
        SimpleImputer(strategy='median', add_indicator=True),
        SimpleImputer(strategy='mean', add_indicator=True),
        KNNImputer(add_indicator=True),
        IterativeImputer(estimator=RandomForestRegressor(random_state=42),
                         random_state=42, add_indicator=True)
    ]
}
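# n_jobs=-1 runs the cross-validation fits in parallel on all available cores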
search_cv = GridSearchCV(pipe, param_grid=params, verbose=1, n_jobs=-1)
search_cv.fit(X_train, y_train)
search_cv.best_params_
search_cv.best_score_
search_cv.score(X_test, y_test)