Pandas Profiling and SHAP Values for European Soccer Match Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
%load_ext sql
# Open the local SQLite file through SQLAlchemy. `connection` is what the
# pd.read_sql calls in this notebook query against.
engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()  # NOTE(review): not used by any visible code

# List every table recorded in SQLite's sqlite_master catalogue.
tables = pd.read_sql(
    sql="""SELECT *
FROM sqlite_master
WHERE type='table';""",
    con=connection,
)
tables
%%sql
SELECT *
FROM Match
LIMIT 3;
connection

# Read all rows and columns of Match_Wins into a DataFrame.
# Disabled alternative using the %sql magic instead of pandas:
#   sql_query = %sql SELECT * FROM Match_Wins
#   df = sql_query.DataFrame()
#   df
match_wins = pd.read_sql(
    sql="""SELECT *
FROM Match_Wins;""",
    con=connection,
)
match_wins
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Build one profiling report for match_wins, then render it three ways:
# as notebook widgets, as an inline iframe, and as a standalone HTML file.
profile = ProfileReport(match_wins, title="Pandas Profiling Report")
profile.to_widgets()
profile.to_notebook_iframe()
profile.to_file("pandas_profiling.html")

# Quick look at the first rows of the raw table.
match_wins.head()
# Heatmap of the null mask of every column, drawn with a two-colour palette.
# NOTE(review): presumably 'red' marks the missing cells — confirm on the plot.
cols = match_wins.columns
colours = ['darkblue', 'red']
null_mask = match_wins[cols].isnull()
sns.heatmap(null_mask, cmap=sns.color_palette(colours))

# Disabled imputation: fill home_team_win with its most frequent value.
# top = match_wins["home_team_win"].describe()['top']
# match_wins["home_team_win"] = match_wins["home_team_win"].fillna(top)
# Print, for each column, the share of missing values as a whole percent,
# and collect [column, percent] pairs for columns whose rounded share is > 0.
# NOTE(review): the source's indentation was lost; this assumes the print
# runs for every column, not only for those with missing values — confirm.
pct_list = []
for col in match_wins.columns:
    pct_missing = match_wins[col].isnull().mean()
    pct_rounded = round(pct_missing * 100)
    if pct_rounded > 0:
        pct_list.append([col, pct_rounded])
    print(f'{col} - {pct_rounded}%')
match_wins.country_name

# Disabled: derive a "Team" column by splitting each country_name on "," and ".".
# Title = []
# for name in match_wins.country_name:
#     Title.append(name.split(",")[1].split(".")[0])
# match_wins["Team"] = Title

# Sum home_team_win for every (home_team, season) pair.
wins_by_team_season = match_wins.groupby(["home_team", "season"])["home_team_win"]
wins_by_team_season.agg(["sum"]).round(0)
# Three `df = df.drop(columns=[...])` statements were removed from this spot.
# `df` is never assigned by any executed line in this notebook, so they raised
# NameError and stopped a top-to-bottom run. The columns they targeted
# ("Name", "PassengerId", "Ticket") are not referenced by any other visible
# code in this analysis.
match_wins.dtypes
# Replace each listed column with the integer codes of its pandas Categorical
# (one code per distinct value; pandas assigns -1 to missing values).
for column in ("country_name", "league_name", "season", "date"):
    match_wins[column] = pd.Categorical(match_wins[column]).codes

for column in ("away_team", "home_team"):
    match_wins[column] = pd.Categorical(match_wins[column]).codes
match_wins["home_team"]

# NOTE(review): `date` is encoded a second time here, as in the original.
# Re-encoding integer codes leaves them unchanged unless the first pass
# produced -1 for missing dates, in which case every code shifts up by one.
# Confirm whether this repeat is intended.
match_wins["date"] = pd.Categorical(match_wins["date"]).codes
match_wins
match_wins.dtypes
# Disabled: match_wins = match_wins.drop(columns=["Title"])

# Separate the label from the features: `target` holds the home_team_win
# values and match_wins keeps every remaining column.
target = match_wins["home_team_win"].values
match_wins = match_wins.drop("home_team_win", axis=1)
match_wins
target
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for evaluation; random_state=0 fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    match_wins,
    target,
    test_size=0.2,
    random_state=0,
)

# Fit a default logistic regression and report its score on the held-out rows.
LR = LogisticRegression()
LR.fit(x_train, y_train)
LR.score(x_test, y_test)
import shap

# Explain the fitted logistic regression: x_train is the background data
# handed to the explainer, and SHAP values are computed for the rows of x_test.
explainer = shap.LinearExplainer(LR, x_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(x_test)

# Summary of the per-feature SHAP values over the test rows.
shap.summary_plot(shap_values, x_test)

# SHAP value of home_team plotted against its (encoded) feature value.
shap.dependence_plot("home_team", shap_values, x_test)

# Bar summary of the SHAP values per feature.
# Fix: shap_values were computed on x_test, so x_test (not x_train) is passed
# as the matching feature matrix, keeping rows and SHAP values aligned.
shap.summary_plot(shap_values, x_test, plot_type="bar")

# Force plot across all test rows; link="logit" is passed to every force plot.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, x_test, link="logit")

# Force plot for the single test row at position 0.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], x_test.iloc[0,:], link="logit")

# Force plot for the single test row at position 3.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test.iloc[3,:], link="logit")