Moving Fiscal Data from a sqlite db to a dask dataframe
This post includes code from Scalable-Data-Analysis-in-Python-with-Dask and coiled-examples.
import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
engine = db.create_engine('sqlite:///fiscal_data.db')
connection = engine.connect()
metadata = db.MetaData()
engine.execute("SELECT * FROM fiscal_data LIMIT 1").fetchall()
sql = """
SELECT year
, region
, province
, gdp
, fdi
, it
, specific
FROM fiscal_data
"""
cnxn = connection
df = pd.read_sql(sql, cnxn)
df
df.columns
df.gdp.hist()
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
from dask import dataframe as dd
ddf = dd.from_pandas(df, npartitions=5)
print(ddf)
ddf.npartitions
ddf.npartitions
len(ddf)
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
ddf.describe().compute()
ddf.columns
feat_list = ["year", "fdi"]
cat_feat_list = ["region", "province"]
target = ["gdp"]
ddf["year"] = ddf["year"].astype(float)
ddf["fdi"] = ddf["fdi"].astype(float)
#ddf["province"] = ddf["province"].astype(float)
#ddf["region"] = ddf["region"].astype(float)
ddf["gdp"] = ddf["gdp"].astype(float)
ddf["it"] = ddf["it"].astype(float)
type(target)
x=ddf[feat_list].persist()
y=ddf[target].persist()
x
y.compute()
print(x.shape,y.shape)
x.count().compute()
from dask_ml.xgboost import XGBRegressor
XGBR = XGBRegressor()
%%time
XGBR_model = XGBR.fit(x,y)
XGBR_model
client.close()