Visualizing Operations with Dask Dataframes on Fiscal Data
This post includes code from Scalable-Data-Analysis-in-Python-with-Dask and coiled-examples.
import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
engine = db.create_engine("sqlite:///fiscal_data.db")
connection = engine.connect()
metadata = db.MetaData()
engine.execute("SELECT * FROM fiscal_data LIMIT 1").fetchall()
sql = """
SELECT year
, region
, province
, gdp
, fdi
, it
, specific
FROM fiscal_data
"""
cnxn = connection
df = pd.read_sql(sql, cnxn)
df
df.columns
df.gdp.hist()
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
from dask import dataframe as dd
ddf = dd.from_pandas(df, npartitions=5)
print(ddf)
ddf.describe().visualize(filename='describe.png')
ddf.head()
max_gdp_per_region = ddf.groupby('region')['gdp'].max()
max_gdp_per_region.visualize()
max_gdp_per_region.compute()
ddf
ddf.npartitions
ddf.npartitions
len(ddf)
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2, n_workers=3, memory_limit="4GB")
client
client.id
ddf.describe().compute()
ddf.columns
ddf["year"] = ddf["year"].astype(int)
ddf["fdi"] = ddf["fdi"].astype(float)
# ddf["province"] = ddf["province"].astype(float)
# ddf["region"] = ddf["region"].astype(float)
ddf["gdp"] = ddf["gdp"].astype(float)
ddf["it"] = ddf["it"].astype(float)
ddf
ddf.nlargest(20, 'gdp').compute()
without_ec = ddf[ddf.region !='East China']
without_ec.nlargest(20, 'gdp').compute()
ddf['province'].compute()
ddf.where(ddf['province']=='Zhejiang').compute()
mask_after_2010 = ddf.where(ddf['year']>2000)
mask_after_2010.compute()
def add_some_text(cname, *args, **kwargs):
return "Region name is " + cname
dummy_values = ddf['region'].apply(add_some_text, axis=1)
dummy_values
dummy_values.visualize()
dummy_values.compute()
max_per_region_yr = ddf.groupby('region').apply(lambda x: x.loc[x['gdp'].idxmax(), 'year'])
max_per_region_yr.visualize()
max_per_region_yr.compute()