Working with sqlite databases in Jupyter for Visualizing European Soccer Match Data
This post includes code and notes from data-analysis-using-sql.
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
%load_ext sql
engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()
connection
tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table';""", connection)
tables
# %%sql
# SELECT *
# FROM sqlite_master
# WHERE type='table'
# ;
engine.execute("SELECT * FROM Country LIMIT 10").fetchall()
%%sql
SELECT *
FROM Match
LIMIT 3;
# %%sql
# DROP TABLE IF EXISTS Team_table
# CREATE TABLE Team_table AS
# SELECT * FROM Team_Attributes LIMIT 10;
sql_query = %sql SELECT * FROM Team_table LIMIT 10
df = sql_query.DataFrame()
df
countries = pd.read_sql("""SELECT *
FROM Country;""", connection)
countries.style.highlight_max()
# leagues = pd.read_sql("""SELECT *
# FROM League
# JOIN Country ON Country.id = League.country_id;""", connection)
# leagues
%%sql
DROP TABLE IF EXISTS Match_Table;
%sql SELECT * FROM Match LIMIT 1;
%sql SELECT * FROM Country LIMIT 1;
%sql SELECT * FROM League LIMIT 1;
%%sql
DROP TABLE IF EXISTS Match_df;
CREATE TABLE Match_df AS
SELECT Match.id,
Country.name AS country_name,
League.name AS league_name,
season,
stage,
date,
HT.team_long_name AS home_team,
AT.team_long_name AS away_team,
home_team_goal,
away_team_goal
FROM Match
JOIN Country on Country.id = Match.country_id
JOIN League on League.id = Match.league_id
LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
ORDER by date
;
%%sql
SELECT COUNT(*) FROM Match_df;
sql_query = %sql SELECT * FROM Match_df LIMIT 10
df = sql_query.DataFrame()
df
%%sql
DROP TABLE IF EXISTS Match_Wins;
CREATE TABLE Match_Wins AS
SELECT *
, CASE WHEN home_team_goal > away_team_goal
THEN 1
ELSE 0
END AS home_team_win
FROM Match_df
;
sql_query = %sql SELECT * FROM Match_Wins
df = sql_query.DataFrame()
df
from dask import dataframe as dd
ddf = dd.from_pandas(df, npartitions=5)
ddf.head()
df["home_team_goal"] = df["home_team_goal"].astype(float)
df["away_team_goal"] = df["away_team_goal"].astype(float)
df["stage"] = df["stage"].astype(float)
feat_list = [
"home_team_goal"
,"away_team_goal"
]
target = ['home_team_win']
X_train = ddf[feat_list].persist()
y_train = ddf[target].persist()
X_train.count().compute()
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
from dask_ml.xgboost import XGBRegressor
XGBR = XGBRegressor()
XGBR_model = XGBR.fit(X_train,y_train)
# from dask_ml.xgboost import XGBClassifier
# XGBC = XGBClassifier()
# XGBC_model = XGBC.fit(X_train,y_train)
client.close()
X, y = df.iloc[:, 1:10], df["home_team_win"]
X
X, y = ddf.iloc[:, 1:10], df["home_team_win"]
X
y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# import xgboost
# dtrain = xgboost.DMatrix(X_train, y_train)
# dtest = xgboost.DMatrix(X_test, y_test)