Using Dask and dask-sql
Dask SQL docs https://dask-sql.readthedocs.io/
dask-sql
import numpy as np
import pandas as pd
df = pd.read_csv('df_panel_fix.csv')
df
from dask import dataframe as dd
ddf = dd.from_pandas(df, npartitions=5)
ddf
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
# client.restart()
ddf.groupby("province").gdp.mean().compute()
from dask_sql import Context
c = Context()
ddf
c.register_dask_table(ddf, "fiscal")
result = c.sql('SELECT count(1) FROM fiscal')
result
result.compute()
result = c.sql("""
SELECT
province,
gdp,
fdi
FROM
fiscal AS fiscal
""")
result.compute()
print(result.compute())
from dask_sql import Context
from dask.datasets import timeseries
print(result.gdp.mean().compute())
result
%%time
ddf.groupby("province").fdi.mean().compute()
%%time
c.sql('SELECT avg(fdi) FROM fiscal GROUP BY province').compute()
dfp = ddf.persist()
import distributed
cached_tasks = distributed.wait(dfp)
print(f'cached {len(cached_tasks[0])} results')
c.register_dask_table(dfp, "fiscal_cached")
result = c.sql('SELECT count(1) FROM fiscal_cached')
result.compute()
%%time
c.sql('SELECT avg(fdi) FROM fiscal GROUP BY province').compute()
c.sql('SELECT floor(3.14)').compute()
%%time
c.sql("""
SELECT floor(fdi) AS fdi, avg(gdp) as gdp, count(1) as fiscal_count
FROM fiscal_cached
WHERE fdi > 50 AND gdp >= 0
GROUP BY floor(fdi)
""").compute()
And now we can run a query and immediately plot a visualization of the result using Pandas plotting syntax!
c.sql("""
SELECT floor(fdi) AS fdi, avg(gdp) as gdp
FROM fiscal_cached
WHERE fdi > 50 AND gdp >= 0
GROUP BY floor(fdi)
""").compute().plot(x='fdi', y='gdp')