Working with sqlite databases in Jupyter for Visualizing European Soccer Match Data
This post includes code and notes from data-analysis-using-sql.
import sqlalchemy as db
import sqlite3
import pandas as pd
import numpy as np
engine = db.create_engine('sqlite:///database.sqlite')
connection = engine.connect()
metadata = db.MetaData()
connection
engine.execute("SELECT * FROM Country LIMIT 10").fetchall()
%load_ext sql
%sql sqlite:///database.sqlite
%%sql
SELECT *
FROM Country
LIMIT 10
%%sql
SELECT id
,name
FROM Country
WHERE name = "England"
%%sql
SELECT * FROM League LIMIT 10;
%%sql
SELECT * FROM Match LIMIT 10;
%%sql
SELECT * FROM Player LIMIT 10;
%%sql
SELECT * FROM Player_Attributes LIMIT 10;
%%sql
SELECT * FROM Team LIMIT 10;
%%sql
SELECT * FROM Team_Attributes LIMIT 10;
%%sql
CREATE TABLE Team_table AS
SELECT * FROM Team_Attributes LIMIT 10;
%%sql
DROP TABLE IF EXISTS Team_table
sql_query = %sql SELECT * FROM Team LIMIT 10
df = sql_query.DataFrame()
df
import matplotlib.pyplot as plt
plt.figure(figsize=(20,7))
plot = %sql SELECT team_short_name, count(*) FROM Team GROUP BY team_short_name ORDER BY team_short_name
plot.bar();
plot.pie();
type(plot)
# #Imports
# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import sqlite3
# import matplotlib.pyplot as plt
# # Input data files are available in the "../input/" directory.
# # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# path = "../input/" #Insert path here
# database = path + 'database.sqlite'
First we will create the connection to the DB, and see what tables we have
tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table';""", connection)
tables
countries = pd.read_sql("""SELECT *
FROM Country;""", connection)
countries
leagues = pd.read_sql("""SELECT *
FROM League
JOIN Country ON Country.id = League.country_id;""", connection)
leagues
teams = pd.read_sql("""SELECT *
FROM Team
ORDER BY team_long_name
LIMIT 10;""", connection)
teams
detailed_matches = pd.read_sql("""SELECT Match.id,
Country.name AS country_name,
League.name AS league_name,
season,
stage,
date,
HT.team_long_name AS home_team,
AT.team_long_name AS away_team,
home_team_goal,
away_team_goal
FROM Match
JOIN Country on Country.id = Match.country_id
JOIN League on League.id = Match.league_id
LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
WHERE country_name = 'Spain'
ORDER by date
LIMIT 10;""", connection)
detailed_matches
# Star with Spain Real Madrid CF, barcelonat
leages_by_season = pd.read_sql("""SELECT Country.name AS country_name,
League.name AS league_name,
season,
count(distinct stage) AS number_of_stages,
count(distinct HT.team_long_name) AS number_of_teams,
avg(home_team_goal) AS avg_home_team_scors,
avg(away_team_goal) AS avg_away_team_goals,
avg(home_team_goal-away_team_goal) AS avg_goal_dif,
avg(home_team_goal+away_team_goal) AS avg_goals,
sum(home_team_goal+away_team_goal) AS total_goals
FROM Match
JOIN Country on Country.id = Match.country_id
JOIN League on League.id = Match.league_id
LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
WHERE country_name in ('Spain', 'Germany', 'France', 'Italy', 'England')
GROUP BY Country.name, League.name, season
HAVING count(distinct stage) > 10
ORDER BY Country.name, League.name, season DESC
;""", connection)
leages_by_season
df = pd.DataFrame(index=np.sort(leages_by_season['season'].unique()), columns=leages_by_season['country_name'].unique())
df.loc[:,'Germany'] = list(leages_by_season.loc[leages_by_season['country_name']=='Germany','avg_goals'])
df.loc[:,'Spain'] = list(leages_by_season.loc[leages_by_season['country_name']=='Spain','avg_goals'])
df.loc[:,'France'] = list(leages_by_season.loc[leages_by_season['country_name']=='France','avg_goals'])
df.loc[:,'Italy'] = list(leages_by_season.loc[leages_by_season['country_name']=='Italy','avg_goals'])
df.loc[:,'England'] = list(leages_by_season.loc[leages_by_season['country_name']=='England','avg_goals'])
df.plot(figsize=(12,5),title='Average Goals per Game Over Time')
df = pd.DataFrame(index=np.sort(leages_by_season['season'].unique()), columns=leages_by_season['country_name'].unique())
df.loc[:,'Germany'] = list(leages_by_season.loc[leages_by_season['country_name']=='Germany','avg_goal_dif'])
df.loc[:,'Spain'] = list(leages_by_season.loc[leages_by_season['country_name']=='Spain','avg_goal_dif'])
df.loc[:,'France'] = list(leages_by_season.loc[leages_by_season['country_name']=='France','avg_goal_dif'])
df.loc[:,'Italy'] = list(leages_by_season.loc[leages_by_season['country_name']=='Italy','avg_goal_dif'])
df.loc[:,'England'] = list(leages_by_season.loc[leages_by_season['country_name']=='England','avg_goal_dif'])
df.plot(figsize=(12,5),title='Average Goals Difference Home vs Out')
players_height = pd.read_sql("""SELECT CASE
WHEN ROUND(height)<165 then 165
WHEN ROUND(height)>195 then 195
ELSE ROUND(height)
END AS calc_height,
COUNT(height) AS distribution,
(avg(PA_Grouped.avg_overall_rating)) AS avg_overall_rating,
(avg(PA_Grouped.avg_potential)) AS avg_potential,
AVG(weight) AS avg_weight
FROM PLAYER
LEFT JOIN (SELECT Player_Attributes.player_api_id,
avg(Player_Attributes.overall_rating) AS avg_overall_rating,
avg(Player_Attributes.potential) AS avg_potential
FROM Player_Attributes
GROUP BY Player_Attributes.player_api_id)
AS PA_Grouped ON PLAYER.player_api_id = PA_Grouped.player_api_id
GROUP BY calc_height
ORDER BY calc_height
;""", connection)
players_height
players_height.calc_height
# players_height.plot(x=['calc_height'],y=['avg_overall_rating'],figsize=(12,5),title='Potential vs Height')