This post includes code and notes from this gist and this post.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np

%load_ext sql

engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()

tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", connection)
tables

%%sql
SELECT * 
FROM Match 
LIMIT 3;

Environment variable $DATABASE_URL not set, and no connect string given.
Connection info needed in SQLAlchemy format, example:
               postgresql://username:password@hostname/dbname
               or an existing connection: dict_keys([])

connection

<sqlalchemy.engine.base.Connection at 0x7f785f788c50>

match_wins = pd.read_sql("""SELECT *
                        FROM Match_Wins;""", connection)

# sql_query = %sql SELECT * FROM Match_Wins
# df = sql_query.DataFrame()
# df

match_wins

import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport

profile = ProfileReport(match_wins, title='Pandas Profiling Report')

profile.to_widgets()

profile.to_notebook_iframe()

profile.to_file(output_file="pandas_profiling.html")

match_wins.head()

cols = match_wins.columns 
colours = ['darkblue', 'red'] 
sns.heatmap(match_wins[cols].isnull(), cmap=sns.color_palette(colours))

<AxesSubplot:>

# top = match_wins["home_team_win"].describe()['top'] # impute with the most frequent value.
# match_wins["home_team_win"] = match_wins["home_team_win"].fillna(top)

pct_list = []
for col in match_wins.columns:
    pct_missing = np.mean(match_wins[col].isnull())
    if round(pct_missing*100) >0:
        pct_list.append([col, round(pct_missing*100)])
    print('{} - {}%'.format(col, round(pct_missing*100)))

id - 0%
country_name - 0%
league_name - 0%
season - 0%
stage - 0%
date - 0%
home_team - 0%
away_team - 0%
home_team_goal - 0%
away_team_goal - 0%
home_team_win - 0%

match_wins.country_name

0        Switzerland
1        Switzerland
2        Switzerland
3        Switzerland
4        Switzerland
            ...     
25974    Switzerland
25975    Switzerland
25976    Switzerland
25977    Switzerland
25978    Switzerland
Name: country_name, Length: 25979, dtype: object

# # extracting the titles from the names:
# Title = []
# for name in match_wins.country_name:
#     Title.append(name.split(",")[1].split(".")[0])
    
# match_wins["Team"] = Title

match_wins.groupby(["home_team", 'season'])['home_team_win'].agg(['sum']).round(0)

df = df.drop(columns = ["Name"])
df = df.drop(columns = ["PassengerId"])
df = df.drop(columns = ["Ticket"])

match_wins.dtypes

id                int64
country_name       int8
league_name        int8
season             int8
stage             int64
date              int16
home_team         int16
away_team         int16
home_team_goal    int64
away_team_goal    int64
home_team_win     int64
dtype: object

match_wins.country_name = pd.Categorical(match_wins.country_name)
match_wins.league_name = pd.Categorical(match_wins.league_name)
match_wins.season = pd.Categorical(match_wins.season)
match_wins.date = pd.Categorical(match_wins.date)

match_wins["country_name"] = match_wins.country_name.cat.codes

match_wins["league_name"] = match_wins.league_name.cat.codes
match_wins["season"] = match_wins.season.cat.codes
match_wins["date"] = match_wins.date.cat.codes

match_wins.home_team = pd.Categorical(match_wins.home_team)

match_wins.away_team = pd.Categorical(match_wins.away_team)

match_wins["away_team"] = match_wins.away_team.cat.codes

match_wins["home_team"] = match_wins.home_team.cat.codes

match_wins["home_team"]

0         24
1         72
2         84
3        173
4         76
        ... 
25974     76
25975    160
25976     84
25977     95
25978    100
Name: home_team, Length: 25979, dtype: int16

match_wins.date = pd.Categorical(match_wins.date)

match_wins["date"] = match_wins.date.cat.codes

match_wins

match_wins.dtypes

id                int64
country_name       int8
league_name        int8
season             int8
stage             int64
date              int16
home_team         int16
away_team         int16
home_team_goal    int64
away_team_goal    int64
home_team_win     int64
dtype: object

#match_wins = match_wins.drop(columns = ["Title"])
target = match_wins.home_team_win.values
match_wins = match_wins.drop(columns =["home_team_win"])

match_wins

target

array([0, 1, 0, ..., 0, 0, 1])

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(match_wins, target, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(x_train, y_train)

/home/gao/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:764: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

LogisticRegression()

LR.score(x_test, y_test)

0.9736335642802155

import shap
explainer = shap.LinearExplainer(LR, x_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(x_test)
shap.summary_plot(shap_values, x_test)

shap.dependence_plot("home_team", shap_values, x_test)

shap.summary_plot(shap_values, x_train, plot_type="bar")

shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, x_test, link="logit")

shap.plots.force is slow for many thousands of rows, try subsampling your data.

shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], x_test.iloc[0,:], link="logit")

shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test.iloc[3,:], link="logit")

	type	name	tbl_name	rootpage	sql
0	table	sqlite_sequence	sqlite_sequence	4	CREATE TABLE sqlite_sequence(name,seq)
1	table	Player_Attributes	Player_Attributes	11	CREATE TABLE "Player_Attributes" (\n\t`id`\tIN...
2	table	Player	Player	14	CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3	table	Match	Match	18	CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4	table	League	League	24	CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5	table	Country	Country	26	CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6	table	Team	Team	29	CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY...
7	table	Team_Attributes	Team_Attributes	2	CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...
8	table	Match_df	Match_df	3	CREATE TABLE Match_df(\n id INT,\n country_n...
9	table	Match_Wins	Match_Wins	308451	CREATE TABLE Match_Wins(\n id INT,\n country...

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2	0
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2	0
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2	0
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0	1
...	...	...	...	...	...	...	...	...	...	...	...
25974	25945	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Basel	Grasshopper Club Zürich	0	1	0
25975	25946	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	Lugano	FC St. Gallen	3	0	1
25976	25947	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Luzern	FC Sion	2	2	0
25977	25948	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Thun	BSC Young Boys	0	3	0
25978	25949	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Zürich	FC Vaduz	3	1	1

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2	0
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2	0
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2	0
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0	1
5	24614	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	AC Bellinzona	Neuchâtel Xamax	1	2	0
6	24615	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Zürich	FC Luzern	1	0	1
7	24616	Switzerland	Switzerland Super League	2008/2009	2	2008-07-24 00:00:00	FC Sion	BSC Young Boys	2	1	1
8	24617	Switzerland	Switzerland Super League	2008/2009	2	2008-07-24 00:00:00	FC Vaduz	FC Aarau	0	2	0
9	24668	Switzerland	Switzerland Super League	2008/2009	3	2008-07-26 00:00:00	FC Basel	AC Bellinzona	2	0	1

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
25969	25940	Switzerland	Switzerland Super League	2015/2016	35	2016-05-22 00:00:00	Grasshopper Club Zürich	FC Thun	0	0	0
25970	25941	Switzerland	Switzerland Super League	2015/2016	35	2016-05-22 00:00:00	FC Sion	FC Zürich	2	2	0
25971	25942	Switzerland	Switzerland Super League	2015/2016	35	2016-05-22 00:00:00	FC Vaduz	Lugano	0	0	0
25972	25943	Switzerland	Switzerland Super League	2015/2016	35	2016-05-22 00:00:00	BSC Young Boys	FC Basel	2	3	0
25973	25944	Switzerland	Switzerland Super League	2015/2016	35	2016-05-22 00:00:00	FC St. Gallen	FC Luzern	1	4	0
25974	25945	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Basel	Grasshopper Club Zürich	0	1	0
25975	25946	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	Lugano	FC St. Gallen	3	0	1
25976	25947	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Luzern	FC Sion	2	2	0
25977	25948	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Thun	BSC Young Boys	0	3	0
25978	25949	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Zürich	FC Vaduz	3	1	1

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2	0
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2	0
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2	0
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0	1

		sum
home_team	season
1. FC Kaiserslautern	2010/2011	6
1. FC Kaiserslautern	2011/2012	2
1. FC Köln	2008/2009	4
	2009/2010	3
	2010/2011	11
...	...	...
Śląsk Wrocław	2011/2012	9
	2012/2013	9
	2013/2014	5
	2014/2015	9
	2015/2016	5