This post includes code and notes from data-analysis-using-sql.

import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
%load_ext sql
engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()
connection
<sqlalchemy.engine.base.Connection at 0x7f70c27cc0b8>
tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", connection)
tables
type name tbl_name rootpage sql
0 table sqlite_sequence sqlite_sequence 4 CREATE TABLE sqlite_sequence(name,seq)
1 table Player_Attributes Player_Attributes 11 CREATE TABLE "Player_Attributes" (\n\t`id`\tIN...
2 table Player Player 14 CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3 table Match Match 18 CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4 table League League 24 CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5 table Country Country 26 CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6 table Team Team 29 CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY...
7 table Team_Attributes Team_Attributes 2 CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...
# %%sql
# SELECT *
# FROM sqlite_master
# WHERE type='table'
# ;
engine.execute("SELECT * FROM Country LIMIT 10").fetchall()
[(1, 'Belgium'),
 (1729, 'England'),
 (4769, 'France'),
 (7809, 'Germany'),
 (10257, 'Italy'),
 (13274, 'Netherlands'),
 (15722, 'Poland'),
 (17642, 'Portugal'),
 (19694, 'Scotland'),
 (21518, 'Spain')]
%%sql
SELECT * 
FROM Match 
LIMIT 3;
 * sqlite:///database.sqlite
Done.
id country_id league_id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal home_player_X1 home_player_X2 home_player_X3 home_player_X4 home_player_X5 home_player_X6 home_player_X7 home_player_X8 home_player_X9 home_player_X10 home_player_X11 away_player_X1 away_player_X2 away_player_X3 away_player_X4 away_player_X5 away_player_X6 away_player_X7 away_player_X8 away_player_X9 away_player_X10 away_player_X11 home_player_Y1 home_player_Y2 home_player_Y3 home_player_Y4 home_player_Y5 home_player_Y6 home_player_Y7 home_player_Y8 home_player_Y9 home_player_Y10 home_player_Y11 away_player_Y1 away_player_Y2 away_player_Y3 away_player_Y4 away_player_Y5 away_player_Y6 away_player_Y7 away_player_Y8 away_player_Y9 away_player_Y10 away_player_Y11 home_player_1 home_player_2 home_player_3 home_player_4 home_player_5 home_player_6 home_player_7 home_player_8 home_player_9 home_player_10 home_player_11 away_player_1 away_player_2 away_player_3 away_player_4 away_player_5 away_player_6 away_player_7 away_player_8 away_player_9 away_player_10 away_player_11 goal shoton shotoff foulcommit card cross corner possession B365H B365D B365A BWH BWD BWA IWH IWD IWA LBH LBD LBA PSH PSD PSA WHH WHD WHA SJH SJD SJA VCH VCD VCA GBH GBD GBA BSH BSD BSA
1 1 1 2008/2009 1 2008-08-17 00:00:00 492473 9987 9993 1 1 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None 1.73 3.4 5 1.75 3.35 4.2 1.85 3.2 3.5 1.8 3.3 3.75 None None None 1.7 3.3 4.33 1.9 3.3 4 1.65 3.4 4.5 1.78 3.25 4 1.73 3.4 4.2
2 1 1 2008/2009 1 2008-08-16 00:00:00 492474 10000 9994 0 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None 1.95 3.2 3.6 1.8 3.3 3.95 1.9 3.2 3.5 1.9 3.2 3.5 None None None 1.83 3.3 3.6 1.95 3.3 3.8 2 3.25 3.25 1.85 3.25 3.75 1.91 3.25 3.6
3 1 1 2008/2009 1 2008-08-16 00:00:00 492475 9984 8635 0 3 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None 2.38 3.3 2.75 2.4 3.3 2.55 2.6 3.1 2.3 2.5 3.2 2.5 None None None 2.5 3.25 2.4 2.63 3.3 2.5 2.35 3.25 2.65 2.5 3.2 2.5 2.3 3.2 2.75
# %%sql
# DROP TABLE IF EXISTS Team_table
# CREATE TABLE Team_table AS
# SELECT * FROM Team_Attributes LIMIT 10;
 * sqlite:///database.sqlite
Done.
[]
sql_query = %sql SELECT * FROM Team_table LIMIT 10
df = sql_query.DataFrame()
df
 * sqlite:///database.sqlite
Done.
countries = pd.read_sql("""SELECT *
                        FROM Country;""", connection)
countries.style.highlight_max()
id name
0 1 Belgium
1 1729 England
2 4769 France
3 7809 Germany
4 10257 Italy
5 13274 Netherlands
6 15722 Poland
7 17642 Portugal
8 19694 Scotland
9 21518 Spain
10 24558 Switzerland
# leagues = pd.read_sql("""SELECT *
#                         FROM League
#                         JOIN Country ON Country.id = League.country_id;""", connection)
# leagues
%%sql
DROP TABLE IF EXISTS Match_Table;
 * sqlite:///database.sqlite
Done.
[]
%sql SELECT * FROM Match LIMIT 1;
 * sqlite:///database.sqlite
Done.
id country_id league_id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal home_player_X1 home_player_X2 home_player_X3 home_player_X4 home_player_X5 home_player_X6 home_player_X7 home_player_X8 home_player_X9 home_player_X10 home_player_X11 away_player_X1 away_player_X2 away_player_X3 away_player_X4 away_player_X5 away_player_X6 away_player_X7 away_player_X8 away_player_X9 away_player_X10 away_player_X11 home_player_Y1 home_player_Y2 home_player_Y3 home_player_Y4 home_player_Y5 home_player_Y6 home_player_Y7 home_player_Y8 home_player_Y9 home_player_Y10 home_player_Y11 away_player_Y1 away_player_Y2 away_player_Y3 away_player_Y4 away_player_Y5 away_player_Y6 away_player_Y7 away_player_Y8 away_player_Y9 away_player_Y10 away_player_Y11 home_player_1 home_player_2 home_player_3 home_player_4 home_player_5 home_player_6 home_player_7 home_player_8 home_player_9 home_player_10 home_player_11 away_player_1 away_player_2 away_player_3 away_player_4 away_player_5 away_player_6 away_player_7 away_player_8 away_player_9 away_player_10 away_player_11 goal shoton shotoff foulcommit card cross corner possession B365H B365D B365A BWH BWD BWA IWH IWD IWA LBH LBD LBA PSH PSD PSA WHH WHD WHA SJH SJD SJA VCH VCD VCA GBH GBD GBA BSH BSD BSA
1 1 1 2008/2009 1 2008-08-17 00:00:00 492473 9987 9993 1 1 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None 1.73 3.4 5 1.75 3.35 4.2 1.85 3.2 3.5 1.8 3.3 3.75 None None None 1.7 3.3 4.33 1.9 3.3 4 1.65 3.4 4.5 1.78 3.25 4 1.73 3.4 4.2
%sql SELECT * FROM Country LIMIT 1;
 * sqlite:///database.sqlite
Done.
id name
1 Belgium
%sql SELECT * FROM League LIMIT 1;
 * sqlite:///database.sqlite
Done.
id country_id name
1 1 Belgium Jupiler League
%%sql
DROP TABLE IF EXISTS Match_df;
CREATE TABLE Match_df AS


SELECT Match.id, 
                                        Country.name AS country_name, 
                                        League.name AS league_name, 
                                        season, 
                                        stage, 
                                        date,
                                        HT.team_long_name AS  home_team,
                                        AT.team_long_name AS away_team,
                                        home_team_goal, 
                                        away_team_goal                                        
                                FROM Match
                                JOIN Country on Country.id = Match.country_id
                                JOIN League on League.id = Match.league_id
                                LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
                                LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
                                ORDER by date
;
 * sqlite:///database.sqlite
Done.
Done.
[]
%%sql
SELECT COUNT(*) FROM Match_df;
 * sqlite:///database.sqlite
Done.
COUNT(*)
25979
sql_query = %sql SELECT * FROM Match_df LIMIT 10
df = sql_query.DataFrame()
df
 * sqlite:///database.sqlite
Done.
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0
5 24614 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 AC Bellinzona Neuchâtel Xamax 1 2
6 24615 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Zürich FC Luzern 1 0
7 24616 Switzerland Switzerland Super League 2008/2009 2 2008-07-24 00:00:00 FC Sion BSC Young Boys 2 1
8 24617 Switzerland Switzerland Super League 2008/2009 2 2008-07-24 00:00:00 FC Vaduz FC Aarau 0 2
9 24668 Switzerland Switzerland Super League 2008/2009 3 2008-07-26 00:00:00 FC Basel AC Bellinzona 2 0

Build home team win label for classification

%%sql
DROP TABLE IF EXISTS Match_Wins;
CREATE TABLE Match_Wins AS


SELECT *
, CASE WHEN home_team_goal > away_team_goal
THEN 1
ELSE 0
END AS home_team_win
FROM Match_df
;
 * sqlite:///database.sqlite
Done.
Done.
[]
sql_query = %sql SELECT * FROM Match_Wins
df = sql_query.DataFrame()
df
 * sqlite:///database.sqlite
Done.
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2 0
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2 0
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2 0
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0 1
... ... ... ... ... ... ... ... ... ... ... ...
25974 25945 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Basel Grasshopper Club Zürich 0 1 0
25975 25946 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 Lugano FC St. Gallen 3 0 1
25976 25947 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Luzern FC Sion 2 2 0
25977 25948 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Thun BSC Young Boys 0 3 0
25978 25949 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Zürich FC Vaduz 3 1 1

25979 rows × 11 columns

from dask import dataframe as dd 
ddf = dd.from_pandas(df, npartitions=5)
ddf.head()
id country_name league_name season stage date home_team away_team home_team_goal away_team_goal home_team_win
0 24559 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2 0
1 24560 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1 1
2 24561 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2 0
3 24562 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2 0
4 24613 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0 1
df["home_team_goal"] = df["home_team_goal"].astype(float)
df["away_team_goal"] = df["away_team_goal"].astype(float)
df["stage"] = df["stage"].astype(float)
feat_list = [
    "home_team_goal"
    ,"away_team_goal"
]
target = ['home_team_win']
X_train = ddf[feat_list].persist()
y_train = ddf[target].persist()
X_train.count().compute()
home_team_goal    25979
away_team_goal    25979
dtype: int64
from dask.distributed import Client

client = Client(processes=False, threads_per_worker=2,
                n_workers=3, memory_limit='4GB')
client

Client

Cluster

  • Workers: 3
  • Cores: 6
  • Memory: 12.00 GB
from dask_ml.xgboost import XGBRegressor

XGBR = XGBRegressor()

XGBR_model = XGBR.fit(X_train,y_train)
# from dask_ml.xgboost import XGBClassifier

# XGBC = XGBClassifier()

# XGBC_model = XGBC.fit(X_train,y_train)
client.close()
X, y = df.iloc[:, 1:10], df["home_team_win"]
X
country_name league_name season stage date home_team away_team home_team_goal away_team_goal
0 Switzerland Switzerland Super League 2008/2009 1 2008-07-18 00:00:00 BSC Young Boys FC Basel 1 2
1 Switzerland Switzerland Super League 2008/2009 1 2008-07-19 00:00:00 FC Aarau FC Sion 3 1
2 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 FC Luzern FC Vaduz 1 2
3 Switzerland Switzerland Super League 2008/2009 1 2008-07-20 00:00:00 Neuchâtel Xamax FC Zürich 1 2
4 Switzerland Switzerland Super League 2008/2009 2 2008-07-23 00:00:00 FC Basel Grasshopper Club Zürich 1 0
... ... ... ... ... ... ... ... ... ...
25974 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Basel Grasshopper Club Zürich 0 1
25975 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 Lugano FC St. Gallen 3 0
25976 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Luzern FC Sion 2 2
25977 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Thun BSC Young Boys 0 3
25978 Switzerland Switzerland Super League 2015/2016 36 2016-05-25 00:00:00 FC Zürich FC Vaduz 3 1

25979 rows × 9 columns

X, y = ddf.iloc[:, 1:10], df["home_team_win"]
X
Dask DataFrame Structure:
country_name league_name season stage date home_team away_team home_team_goal away_team_goal
npartitions=5
0 object object object int64 object object object int64 int64
5196 ... ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ... ... ...
20784 ... ... ... ... ... ... ... ... ...
25978 ... ... ... ... ... ... ... ... ...
Dask Name: getitem, 10 tasks
y
0    0
1    1
2    0
3    0
4    1
5    0
6    1
7    1
8    0
9    1
Name: home_team_win, dtype: int64
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# import xgboost

# dtrain = xgboost.DMatrix(X_train, y_train)
# dtest = xgboost.DMatrix(X_test, y_test)