This post includes code and notes from this gist and this post.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
%load_ext sql
engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()
tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", connection)
tables
type name tbl_name rootpage sql
0 table sqlite_sequence sqlite_sequence 4 CREATE TABLE sqlite_sequence(name,seq)
1 table Player_Attributes Player_Attributes 11 CREATE TABLE "Player_Attributes" (\n\t`id`\tIN...
2 table Player Player 14 CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3 table Match Match 18 CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4 table League League 24 CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5 table Country Country 26 CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6 table Team Team 29 CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY...
7 table Team_Attributes Team_Attributes 2 CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...
8 table Match_df Match_df 3 CREATE TABLE Match_df(\n id INT,\n country_n...
9 table Match_Wins Match_Wins 308451 CREATE TABLE Match_Wins(\n id INT,\n country...
%%sql
SELECT * 
FROM Match 
LIMIT 3;
Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])
connection
<sqlalchemy.engine.base.Connection at 0x7f785f788c50>
match_wins = pd.read_sql("""SELECT *
                        FROM Match_Wins;""", connection)
# sql_query = %sql SELECT * FROM Match_Wins
# df = sql_query.DataFrame()
# df
match_wins
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2 0
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2 0
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2 0
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0 1
... ... ... ... ... ... ... ... ... ... ... ...
25974 25945 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Basel Grasshopper Club Zürich 0 1 0
25975 25946 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 Lugano FC St. Gallen 3 0 1
25976 25947 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Luzern FC Sion 2 2 0
25977 25948 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Thun BSC Young Boys 0 3 0
25978 25949 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Zürich FC Vaduz 3 1 1

25979 rows × 11 columns

import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport

profile = ProfileReport(match_wins, title='Pandas Profiling Report')
profile.to_widgets()

id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2 0
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2 0
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2 0
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0 1
5 24614 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 AC Bellinzona Neuchâtel Xamax 1 2 0
6 24615 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Zürich FC Luzern 1 0 1
7 24616 Switzerland Switzerland Super League 2008/2009 2 2008-07-24 00:00:00 FC Sion BSC Young Boys 2 1 1
8 24617 Switzerland Switzerland Super League 2008/2009 2 2008-07-24 00:00:00 FC Vaduz FC Aarau 0 2 0
9 24668 Switzerland Switzerland Super League 2008/2009 3 2008-07-26 00:00:00 FC Basel AC Bellinzona 2 0 1
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
25969 25940 Switzerland Switzerland Super League 2015/2016 35 2016-05-22 00:00:00 Grasshopper Club Zürich FC Thun 0 0 0
25970 25941 Switzerland Switzerland Super League 2015/2016 35 2016-05-22 00:00:00 FC Sion FC Zürich 2 2 0
25971 25942 Switzerland Switzerland Super League 2015/2016 35 2016-05-22 00:00:00 FC Vaduz Lugano 0 0 0
25972 25943 Switzerland Switzerland Super League 2015/2016 35 2016-05-22 00:00:00 BSC Young Boys FC Basel 2 3 0
25973 25944 Switzerland Switzerland Super League 2015/2016 35 2016-05-22 00:00:00 FC St. Gallen FC Luzern 1 4 0
25974 25945 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Basel Grasshopper Club Zürich 0 1 0
25975 25946 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 Lugano FC St. Gallen 3 0 1
25976 25947 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Luzern FC Sion 2 2 0
25977 25948 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Thun BSC Young Boys 0 3 0
25978 25949 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Zürich FC Vaduz 3 1 1
profile.to_notebook_iframe()

profile.to_file(output_file="pandas_profiling.html")

match_wins.head()
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2 0
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2 0
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2 0
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0 1
cols = match_wins.columns 
colours = ['darkblue', 'red'] 
sns.heatmap(match_wins[cols].isnull(), cmap=sns.color_palette(colours))
<AxesSubplot:>
# top = match_wins["home_team_win"].describe()['top'] # impute with the most frequent value.
# match_wins["home_team_win"] = match_wins["home_team_win"].fillna(top)
pct_list = []
for col in match_wins.columns:
    pct_missing = np.mean(match_wins[col].isnull())
    if round(pct_missing*100) >0:
        pct_list.append([col, round(pct_missing*100)])
    print('{} - {}%'.format(col, round(pct_missing*100)))
id - 0%
country_name - 0%
league_name - 0%
season - 0%
stage - 0%
date - 0%
home_team - 0%
away_team - 0%
home_team_goal - 0%
away_team_goal - 0%
home_team_win - 0%
match_wins.country_name
0        Switzerland
1        Switzerland
2        Switzerland
3        Switzerland
4        Switzerland
            ...     
25974    Switzerland
25975    Switzerland
25976    Switzerland
25977    Switzerland
25978    Switzerland
Name: country_name, Length: 25979, dtype: object
# # extracting the titles from the names:
# Title = []
# for name in match_wins.country_name:
#     Title.append(name.split(",")[1].split(".")[0])
    
# match_wins["Team"] = Title
match_wins.groupby(["home_team", 'season'])['home_team_win'].agg(['sum']).round(0)
sum
home_team season
1. FC Kaiserslautern 2010/2011 6
2011/2012 2
1. FC Köln 2008/2009 4
2009/2010 3
2010/2011 11
... ... ...
Śląsk Wrocław 2011/2012 9
2012/2013 9
2013/2014 5
2014/2015 9
2015/2016 5

1478 rows × 1 columns

df = df.drop(columns = ["Name"])
df = df.drop(columns = ["PassengerId"])
df = df.drop(columns = ["Ticket"])
match_wins.dtypes
id                int64
country_name       int8
league_name        int8
season             int8
stage             int64
date              int16
home_team         int16
away_team         int16
home_team_goal    int64
away_team_goal    int64
home_team_win     int64
dtype: object
match_wins.country_name = pd.Categorical(match_wins.country_name)
match_wins.league_name = pd.Categorical(match_wins.league_name)
match_wins.season = pd.Categorical(match_wins.season)
match_wins.date = pd.Categorical(match_wins.date)
match_wins["country_name"] = match_wins.country_name.cat.codes
match_wins["league_name"] = match_wins.league_name.cat.codes
match_wins["season"] = match_wins.season.cat.codes
match_wins["date"] = match_wins.date.cat.codes
match_wins.home_team = pd.Categorical(match_wins.home_team)
match_wins.away_team = pd.Categorical(match_wins.away_team)
match_wins["away_team"] = match_wins.away_team.cat.codes
match_wins["home_team"] = match_wins.home_team.cat.codes
match_wins["home_team"]
0         24
1         72
2         84
3        173
4         76
        ... 
25974     76
25975    160
25976     84
25977     95
25978    100
Name: home_team, Length: 25979, dtype: int16
match_wins.date = pd.Categorical(match_wins.date)
match_wins["date"] = match_wins.date.cat.codes
match_wins
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 10 10 0 1 0 24 76 1 2 0
1 24560 10 10 0 1 1 72 91 3 1 1
2 24561 10 10 0 1 2 84 98 1 2 0
3 24562 10 10 0 1 2 173 100 1 2 0
4 24613 10 10 0 2 3 76 117 1 0 1
... ... ... ... ... ... ... ... ... ... ... ...
25974 25945 10 10 7 36 1693 76 117 0 1 0
25975 25946 10 10 7 36 1693 160 93 3 0 1
25976 25947 10 10 7 36 1693 84 91 2 2 0
25977 25948 10 10 7 36 1693 95 24 0 3 0
25978 25949 10 10 7 36 1693 100 98 3 1 1

25979 rows × 11 columns

match_wins.dtypes
id                int64
country_name       int8
league_name        int8
season             int8
stage             int64
date              int16
home_team         int16
away_team         int16
home_team_goal    int64
away_team_goal    int64
home_team_win     int64
dtype: object
#match_wins = match_wins.drop(columns = ["Title"])
target = match_wins.home_team_win.values
match_wins = match_wins.drop(columns =["home_team_win"])
match_wins
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal
0 24559 10 10 0 1 0 24 76 1 2
1 24560 10 10 0 1 1 72 91 3 1
2 24561 10 10 0 1 2 84 98 1 2
3 24562 10 10 0 1 2 173 100 1 2
4 24613 10 10 0 2 3 76 117 1 0
... ... ... ... ... ... ... ... ... ... ...
25974 25945 10 10 7 36 1693 76 117 0 1
25975 25946 10 10 7 36 1693 160 93 3 0
25976 25947 10 10 7 36 1693 84 91 2 2
25977 25948 10 10 7 36 1693 95 24 0 3
25978 25949 10 10 7 36 1693 100 98 3 1

25979 rows × 10 columns

target
array([0, 1, 0, ..., 0, 0, 1])
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(match_wins, target, test_size=0.2, random_state=0)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train, y_train)
/home/gao/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
LogisticRegression()
LR.score(x_test, y_test)
0.9736335642802155
import shap
explainer = shap.LinearExplainer(LR, x_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(x_test)
shap.summary_plot(shap_values, x_test)
shap.dependence_plot("home_team", shap_values, x_test)
shap.summary_plot(shap_values, x_train, plot_type="bar")
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, x_test, link="logit")
shap.plots.force is slow for many thousands of rows, try subsampling your data.
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], x_test.iloc[0,:], link="logit")
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test.iloc[3,:], link="logit")
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.