This post includes code and notes from data-analysis-using-sql.

import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np

%load_ext sql

engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()

connection

<sqlalchemy.engine.base.Connection at 0x7f70c27cc0b8>

tables = pd.read_sql("""SELECT *
                        FROM sqlite_master
                        WHERE type='table';""", connection)
tables

# %%sql
# SELECT *
# FROM sqlite_master
# WHERE type='table'
# ;

engine.execute("SELECT * FROM Country LIMIT 10").fetchall()

[(1, 'Belgium'),
 (1729, 'England'),
 (4769, 'France'),
 (7809, 'Germany'),
 (10257, 'Italy'),
 (13274, 'Netherlands'),
 (15722, 'Poland'),
 (17642, 'Portugal'),
 (19694, 'Scotland'),
 (21518, 'Spain')]

%%sql
SELECT * 
FROM Match 
LIMIT 3;

 * sqlite:///database.sqlite
Done.

# %%sql
# DROP TABLE IF EXISTS Team_table
# CREATE TABLE Team_table AS
# SELECT * FROM Team_Attributes LIMIT 10;

 * sqlite:///database.sqlite
Done.

[]

sql_query = %sql SELECT * FROM Team_table LIMIT 10
df = sql_query.DataFrame()
df

 * sqlite:///database.sqlite
Done.

countries = pd.read_sql("""SELECT *
                        FROM Country;""", connection)
countries.style.highlight_max()

# leagues = pd.read_sql("""SELECT *
#                         FROM League
#                         JOIN Country ON Country.id = League.country_id;""", connection)
# leagues

%%sql
DROP TABLE IF EXISTS Match_Table;

 * sqlite:///database.sqlite
Done.

[]

%sql SELECT * FROM Match LIMIT 1;

 * sqlite:///database.sqlite
Done.

%sql SELECT * FROM Country LIMIT 1;

 * sqlite:///database.sqlite
Done.

%sql SELECT * FROM League LIMIT 1;

 * sqlite:///database.sqlite
Done.

%%sql
DROP TABLE IF EXISTS Match_df;
CREATE TABLE Match_df AS


SELECT Match.id, 
                                        Country.name AS country_name, 
                                        League.name AS league_name, 
                                        season, 
                                        stage, 
                                        date,
                                        HT.team_long_name AS  home_team,
                                        AT.team_long_name AS away_team,
                                        home_team_goal, 
                                        away_team_goal                                        
                                FROM Match
                                JOIN Country on Country.id = Match.country_id
                                JOIN League on League.id = Match.league_id
                                LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
                                LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
                                ORDER by date
;

 * sqlite:///database.sqlite
Done.
Done.

[]

%%sql
SELECT COUNT(*) FROM Match_df;

 * sqlite:///database.sqlite
Done.

sql_query = %sql SELECT * FROM Match_df LIMIT 10
df = sql_query.DataFrame()
df

 * sqlite:///database.sqlite
Done.

Build home team win label for classification

%%sql
DROP TABLE IF EXISTS Match_Wins;
CREATE TABLE Match_Wins AS


SELECT *
, CASE WHEN home_team_goal > away_team_goal
THEN 1
ELSE 0
END AS home_team_win
FROM Match_df
;

 * sqlite:///database.sqlite
Done.
Done.

[]

sql_query = %sql SELECT * FROM Match_Wins
df = sql_query.DataFrame()
df

 * sqlite:///database.sqlite
Done.

from dask import dataframe as dd 
ddf = dd.from_pandas(df, npartitions=5)

ddf.head()

df["home_team_goal"] = df["home_team_goal"].astype(float)
df["away_team_goal"] = df["away_team_goal"].astype(float)
df["stage"] = df["stage"].astype(float)

feat_list = [
    "home_team_goal"
    ,"away_team_goal"
]

target = ['home_team_win']

X_train = ddf[feat_list].persist()
y_train = ddf[target].persist()

X_train.count().compute()

home_team_goal    25979
away_team_goal    25979
dtype: int64

from dask.distributed import Client

client = Client(processes=False, threads_per_worker=2,
                n_workers=3, memory_limit='4GB')
client

from dask_ml.xgboost import XGBRegressor

XGBR = XGBRegressor()

XGBR_model = XGBR.fit(X_train,y_train)

# from dask_ml.xgboost import XGBClassifier

# XGBC = XGBClassifier()

# XGBC_model = XGBC.fit(X_train,y_train)

client.close()

X, y = df.iloc[:, 1:10], df["home_team_win"]
X

X, y = ddf.iloc[:, 1:10], df["home_team_win"]
X

y

0    0
1    1
2    0
3    0
4    1
5    0
6    1
7    1
8    0
9    1
Name: home_team_win, dtype: int64

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# import xgboost

# dtrain = xgboost.DMatrix(X_train, y_train)
# dtest = xgboost.DMatrix(X_test, y_test)

id	country_id	league_id	season	stage	date	match_api_id	home_team_api_id	away_team_api_id	home_team_goal	away_team_goal	home_player_X1	home_player_X2	home_player_X3	home_player_X4	home_player_X5	home_player_X6	home_player_X7	home_player_X8	home_player_X9	home_player_X10	home_player_X11	away_player_X1	away_player_X2	away_player_X3	away_player_X4	away_player_X5	away_player_X6	away_player_X7	away_player_X8	away_player_X9	away_player_X10	away_player_X11	home_player_Y1	home_player_Y2	home_player_Y3	home_player_Y4	home_player_Y5	home_player_Y6	home_player_Y7	home_player_Y8	home_player_Y9	home_player_Y10	home_player_Y11	away_player_Y1	away_player_Y2	away_player_Y3	away_player_Y4	away_player_Y5	away_player_Y6	away_player_Y7	away_player_Y8	away_player_Y9	away_player_Y10	away_player_Y11	home_player_1	home_player_2	home_player_3	home_player_4	home_player_5	home_player_6	home_player_7	home_player_8	home_player_9	home_player_10	home_player_11	away_player_1	away_player_2	away_player_3	away_player_4	away_player_5	away_player_6	away_player_7	away_player_8	away_player_9	away_player_10	away_player_11	goal	shoton	shotoff	foulcommit	card	cross	corner	possession	B365H	B365D	B365A	BWH	BWD	BWA	IWH	IWD	IWA	LBH	LBD	LBA	PSH	PSD	PSA	WHH	WHD	WHA	SJH	SJD	SJA	VCH	VCD	VCA	GBH	GBD	GBA	BSH	BSD	BSA
1	1	1	2008/2009	1	2008-08-17 00:00:00	492473	9987	9993	1	1	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	1.73	3.4	5	1.75	3.35	4.2	1.85	3.2	3.5	1.8	3.3	3.75	None	None	None	1.7	3.3	4.33	1.9	3.3	4	1.65	3.4	4.5	1.78	3.25	4	1.73	3.4	4.2
2	1	1	2008/2009	1	2008-08-16 00:00:00	492474	10000	9994	0	0	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	1.95	3.2	3.6	1.8	3.3	3.95	1.9	3.2	3.5	1.9	3.2	3.5	None	None	None	1.83	3.3	3.6	1.95	3.3	3.8	2	3.25	3.25	1.85	3.25	3.75	1.91	3.25	3.6
3	1	1	2008/2009	1	2008-08-16 00:00:00	492475	9984	8635	0	3	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	None	2.38	3.3	2.75	2.4	3.3	2.55	2.6	3.1	2.3	2.5	3.2	2.5	None	None	None	2.5	3.25	2.4	2.63	3.3	2.5	2.35	3.25	2.65	2.5	3.2	2.5	2.3	3.2	2.75

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0
5	24614	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	AC Bellinzona	Neuchâtel Xamax	1	2
6	24615	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Zürich	FC Luzern	1	0
7	24616	Switzerland	Switzerland Super League	2008/2009	2	2008-07-24 00:00:00	FC Sion	BSC Young Boys	2	1
8	24617	Switzerland	Switzerland Super League	2008/2009	2	2008-07-24 00:00:00	FC Vaduz	FC Aarau	0	2
9	24668	Switzerland	Switzerland Super League	2008/2009	3	2008-07-26 00:00:00	FC Basel	AC Bellinzona	2	0

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2	0
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2	0
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2	0
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0	1
...	...	...	...	...	...	...	...	...	...	...	...
25974	25945	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Basel	Grasshopper Club Zürich	0	1	0
25975	25946	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	Lugano	FC St. Gallen	3	0	1
25976	25947	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Luzern	FC Sion	2	2	0
25977	25948	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Thun	BSC Young Boys	0	3	0
25978	25949	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Zürich	FC Vaduz	3	1	1

	id	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal	home_team_win
0	24559	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2	0
1	24560	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1	1
2	24561	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2	0
3	24562	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2	0
4	24613	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0	1

	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal
0	Switzerland	Switzerland Super League	2008/2009	1	2008-07-18 00:00:00	BSC Young Boys	FC Basel	1	2
1	Switzerland	Switzerland Super League	2008/2009	1	2008-07-19 00:00:00	FC Aarau	FC Sion	3	1
2	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	FC Luzern	FC Vaduz	1	2
3	Switzerland	Switzerland Super League	2008/2009	1	2008-07-20 00:00:00	Neuchâtel Xamax	FC Zürich	1	2
4	Switzerland	Switzerland Super League	2008/2009	2	2008-07-23 00:00:00	FC Basel	Grasshopper Club Zürich	1	0
...	...	...	...	...	...	...	...	...	...
25974	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Basel	Grasshopper Club Zürich	0	1
25975	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	Lugano	FC St. Gallen	3	0
25976	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Luzern	FC Sion	2	2
25977	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Thun	BSC Young Boys	0	3
25978	Switzerland	Switzerland Super League	2015/2016	36	2016-05-25 00:00:00	FC Zürich	FC Vaduz	3	1

Working with sqlite databases in Jupyter for Visualizing European Soccer Match Data

Build home team win label for classification

Client

Cluster

	type	name	tbl_name	rootpage	sql
0	table	sqlite_sequence	sqlite_sequence	4	CREATE TABLE sqlite_sequence(name,seq)
1	table	Player_Attributes	Player_Attributes	11	CREATE TABLE "Player_Attributes" (\n\t`id`\tIN...
2	table	Player	Player	14	CREATE TABLE `Player` (\n\t`id`\tINTEGER PRIMA...
3	table	Match	Match	18	CREATE TABLE `Match` (\n\t`id`\tINTEGER PRIMAR...
4	table	League	League	24	CREATE TABLE `League` (\n\t`id`\tINTEGER PRIMA...
5	table	Country	Country	26	CREATE TABLE `Country` (\n\t`id`\tINTEGER PRIM...
6	table	Team	Team	29	CREATE TABLE "Team" (\n\t`id`\tINTEGER PRIMARY...
7	table	Team_Attributes	Team_Attributes	2	CREATE TABLE `Team_Attributes` (\n\t`id`\tINTE...

	id	name
0	1	Belgium
1	1729	England
2	4769	France
3	7809	Germany
4	10257	Italy
5	13274	Netherlands
6	15722	Poland
7	17642	Portugal
8	19694	Scotland
9	21518	Spain
10	24558	Switzerland

	country_name	league_name	season	stage	date	home_team	away_team	home_team_goal	away_team_goal
npartitions=5
0	object	object	object	int64	object	object	object	int64	int64
5196	...	...	...	...	...	...	...	...	...
...	...	...	...	...	...	...	...	...	...
20784	...	...	...	...	...	...	...	...	...
25978	...	...	...	...	...	...	...	...	...