Poisson regression
#code from https://github.com/thomasjpfan/ml-workshop-advanced
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
sns.set_theme(font_scale=1.5, rc={'figure.figsize': [12, 8]})
sklearn.set_config(display='diagram')
from pathlib import Path
data_path = Path("data")
bikes_path = data_path / "london_bikes.csv"
- "timestamp" - timestamp field for grouping the data
- "cnt" - the count of a new bike shares
- "t1" - real temperature in C
- "t2" - temperature in C "feels like"
- "hum" - humidity in percentage
- "windspeed" - wind speed in km/h
- "weathercode" - category of the weather
- "isholiday" - boolean field - 1 holiday / 0 non holiday
- "isweekend" - boolean field - 1 if the day is weekend
- "season" - category field meteorological seasons: 0-spring ; 1-summer; 2-fall; 3-winter.
bikes = pd.read_csv(bikes_path, parse_dates=['timestamp'])
bikes.head()
bikes['timestamp'].head()
bikes['hr'] = bikes['timestamp'].dt.hour
bikes['weather_code'].unique()
bikes['season'].unique()
X = bikes[['t1', 't2', 'hum', 'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season', 'hr']]
y = bikes['cnt']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
numerical_featurse = ['t1', 't2', 'hum', 'wind_speed', 'is_holiday', 'is_weekend', 'hr']
cat_features = ['weather_code', 'season']
ct = ColumnTransformer([
('numerical', 'passthrough', numerical_featurse),
('categorical', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_features)
])
from sklearn.linear_model import PoissonRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pois_reg = Pipeline([
('prep', ct),
('scaler', StandardScaler()),
('reg', PoissonRegressor())
])
pois_reg
pois_reg.fit(X_train, y_train);
pois_reg.score(X_test, y_test)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
numerical_featurse = ['t1', 't2', 'hum', 'wind_speed', 'is_holiday', 'is_weekend']
cat_features = ['weather_code', 'season', 'hr']
ct = ColumnTransformer([
('numerical', 'passthrough', numerical_featurse),
('categorical', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_features)
])
pois_reg = Pipeline([
('prep', ct),
('scalar', StandardScaler()),
('reg', PoissonRegressor())
])
pois_reg.fit(X_train, y_train)
pois_reg.score(X_test, y_test)
from sklearn.linear_model import Ridge
ct = ColumnTransformer([
('numerical', 'passthrough', numerical_featurse),
('categorical', OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_features)
])
ridge = Pipeline([
('prep', ct),
('scalar', StandardScaler()),
('reg', Ridge(random_state=42))
])
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_poisson_deviance
def compute_metrics(y_true, y_pred, sample_weight=None):
mask = y_pred > 0
if (~mask).any():
n_masked, n_samples = (~mask).sum(), mask.shape[0]
print(f"WARNING: Estimator yields invalid, non-positive predictions "
f" for {n_masked} samples out of {n_samples}. These predictions "
f"are ignored when computing the Poisson deviance.")
y_true = y_true[mask]
y_pred = y_pred[mask]
if sample_weight is not None:
sample_weight = sample_weight[mask]
return {
'mse': mean_squared_error(y_true, y_pred, sample_weight=sample_weight),
'mean poisson deviance': mean_poisson_deviance(y_true, y_pred, sample_weight=sample_weight)
}
ridge_pred = ridge.predict(X_test)
compute_metrics(y_test, ridge_pred)
poisson_pred = pois_reg.predict(X_test)
compute_metrics(y_test, poisson_pred)
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
ax1.hist(y_test, bins=30, alpha=0.5)
ax1.set_title("Test data")
ax2.hist(poisson_pred, bins=30, alpha=0.5)
ax2.set_title("Poisson predictions")
ax3.hist(ridge_pred, bins=30, alpha=0.5)
ax3.set_title("Ridge predictions")
# %load solutions/03-ex01-solutions.py
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
hist = HistGradientBoostingRegressor(random_state=42)
hist.fit(X_train, y_train)
hist_pred = hist.predict(X_test)
compute_metrics(y_test, hist_pred)
hist_poisson = HistGradientBoostingRegressor(loss='poisson', random_state=42)
hist_poisson.fit(X_train, y_train)
hist_poisson_pred = hist_poisson.predict(X_test)
compute_metrics(y_test, hist_poisson_pred)
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
ax1.hist(y_test, bins=30, alpha=0.5)
ax1.set_title("Test data")
ax2.hist(hist_pred, bins=30, alpha=0.5)
ax2.set_title("Default Hist")
ax3.hist(hist_poisson_pred, bins=30, alpha=0.5)
ax3.set_title("Poisson Hist");
from sklearn.utils import gen_even_slices
def _calibration_curve_weighted(y_true, y_pred, n_bins=10, sample_weight=None):
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
idx_sort = np.argsort(y_pred)
y_pred_bin = np.zeros(n_bins)
y_true_bin = np.zeros(n_bins)
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
for i, sl in enumerate(gen_even_slices(len(y_true), n_bins)):
if sample_weight is None:
y_pred_bin[i] = np.average(y_pred[idx_sort][sl])
y_true_bin[i] = np.average(y_true[idx_sort][sl])
else:
weights = sample_weight[idx_sort][sl]
y_pred_bin[i] = np.average(y_pred[idx_sort][sl], weights=weights)
y_true_bin[i] = np.average(y_true[idx_sort][sl], weights=weights)
return y_pred_bin, y_true_bin
def plot_calibration_curve_weights(y_true, y_pred, n_bins=10, ax=None, title="", sample_weight=None):
if ax is None:
fig, ax = plt.subplots()
y_pred_bin, y_true_bin = _calibration_curve_weighted(y_test, y_pred, sample_weight=sample_weight)
bin_centers = np.arange(1, len(y_pred_bin) + 1)
ax.plot(bin_centers, y_pred_bin, marker='x', linestyle="--", label="predictions")
ax.plot(bin_centers, y_true_bin, marker='o', linestyle="--", label="observations")
ax.set(xlabel="Bin number", xticks=bin_centers, title=title)
ax.legend()
return ax
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))
plot_calibration_curve_weights(y_test, ridge_pred, ax=ax1, title="Ridge")
plot_calibration_curve_weights(y_test, poisson_pred, ax=ax2, title="Poisson Regression")
plot_calibration_curve_weights(y_test, hist_poisson_pred, ax=ax3, title="Hist Poisson");
claims_path = data_path / "claims.csv"
claims = pd.read_csv(claims_path)
- ClaimNb: number of claims on the given policy;
- Exposure: total exposure in yearly units;
- Area: area code (categorical, ordinal);
- VehPower: power of the car (categorical, ordinal);
- VehAge: age of the car in years;
- DrivAge: age of the (most common) driver in years;
- BonusMalus: bonus-malus level between 50 and 230 (with reference level 100);
- VehBrand: car brand (categorical, nominal);
- VehGas: diesel or regular fuel car (binary);
- Density: density of inhabitants per km2 in the city of the living place of the driver;
- Region: regions in France (prior to 2016)
claims.head()
exposure = claims['Exposure']
y = claims["ClaimNb"] / exposure
X = claims.drop(["Exposure", "ClaimNb"], axis="columns")
X.head()
X_train, X_test, y_train, y_test, exposure_train, exposure_test = train_test_split(
X, y, exposure, random_state=42)
from sklearn.dummy import DummyRegressor
dummy = DummyRegressor()
dummy.fit(X_train, y_train, sample_weight=exposure_train)
dummy_pred = dummy.predict(X_test)
compute_metrics(y_test, dummy_pred, sample_weight=exposure_test)
dummy_pred[:10]
X['Density'].hist(bins=20);
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
linear_model_preprocessor = ColumnTransformer(
[
("passthrough_numeric", "passthrough",
["BonusMalus"]),
("binned_numeric", KBinsDiscretizer(n_bins=10),
["VehAge", "DrivAge"]),
("log_scaled_numeric", FunctionTransformer(np.log, validate=False),
["Density"]),
("onehot_categorical", OneHotEncoder(handle_unknown='ignore'),
["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
],
)
linear_model_preprocessor.fit_transform(X_train)
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MaxAbsScaler
ridge = Pipeline([
("preprocessor", linear_model_preprocessor),
("scaler", MaxAbsScaler()),
("reg", Ridge(alpha=1e-6))])
ridge.fit(X_train, y_train, reg__sample_weight=exposure_train)
ridge_pred = ridge.predict(X_test)
compute_metrics(y_test, ridge_pred, sample_weight=exposure_test)
fig, ax = plt.subplots(figsize=(8, 8))
plot_calibration_curve_weights(y_test, ridge_pred, ax=ax, title="Ridge", sample_weight=exposure_test);
# %load solutions/03-ex02-solutions.py
poission_reg = Pipeline([
("preprocessor", linear_model_preprocessor),
("scaler", MaxAbsScaler()),
("reg", PoissonRegressor(alpha=1e-12))])
poission_reg.fit(X_train, y_train, reg__sample_weight=exposure_train)
poisson_pred = poission_reg.predict(X_test)
compute_metrics(y_test, poisson_pred, sample_weight=exposure_test)
fig, ax = plt.subplots(figsize=(8, 8))
plot_calibration_curve_weights(y_test, poisson_pred, ax=ax, title="Poisson", sample_weight=exposure_test)
# %load solutions/03-ex03-solutions.py
from sklearn.preprocessing import OrdinalEncoder
tree_preprocessor = ColumnTransformer(
[
("categorical", OrdinalEncoder(),
["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
("numeric", "passthrough",
["VehAge", "DrivAge", "BonusMalus", "Density"]),
]
)
hist_poisson = Pipeline([
("preprocessor", tree_preprocessor),
("reg", HistGradientBoostingRegressor(loss="poisson", random_state=0))
])
hist_poisson.fit(X_train, y_train, reg__sample_weight=exposure_train)
hist_poisson_pred = hist_poisson.predict(X_test)
compute_metrics(y_test, hist_poisson_pred, sample_weight=exposure_test)
fig, ax = plt.subplots(figsize=(8, 8))
plot_calibration_curve_weights(y_test, hist_poisson_pred, ax=ax, title="Hist Poisson", sample_weight=exposure_test);