import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
engine = db.create_engine('sqlite:///fiscal_data.db')
connection = engine.connect()
metadata = db.MetaData()
engine.execute("SELECT * FROM fiscal_data LIMIT 1").fetchall()
[(1996, 'East China', 'Anhui', '2093.3', 50661, 631930, 147002)]
sql = """
SELECT year
, region
, province
, gdp
, fdi
, it
, specific
FROM fiscal_data
"""

cnxn = connection
df = pd.read_sql(sql, cnxn)
df
year region province gdp fdi it specific
0 1996 East China Anhui 2093.3 50661 631930 147002.0
1 1997 East China Anhui 2347.32 43443 657860 151981.0
2 1998 East China Anhui 2542.96 27673 889463 174930.0
3 1999 East China Anhui 2712.34 26131 1227364 285324.0
4 2000 East China Anhui 2902.09 31847 1499110 195580.0
... ... ... ... ... ... ... ...
355 2003 East China Zhejiang 9705.02 498055 2261631 391292.0
356 2004 East China Zhejiang 11648.7 668128 3162299 656175.0
357 2005 East China Zhejiang 13417.68 772000 2370200 656175.0
358 2006 East China Zhejiang 15718.47 888935 2553268 1017303.0
359 2007 East China Zhejiang 18753.73 1036576 2939778 844647.0

360 rows × 7 columns

df.columns
Index(['year', 'region', 'province', 'gdp', 'fdi', 'it', 'specific'], dtype='object')
df.gdp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f41255eacc0>
from dask.distributed import Client

client = Client(processes=False, threads_per_worker=2,
                n_workers=3, memory_limit='4GB')
client
from dask import dataframe as dd 
ddf = dd.from_pandas(df, npartitions=5)
print(ddf)
ddf.npartitions
ddf.npartitions
len(ddf)
from dask.distributed import Client

client = Client(processes=False, threads_per_worker=2,
                n_workers=3, memory_limit='4GB')
client
ddf.describe().compute()
ddf.columns
feat_list = ["year", "fdi"]
cat_feat_list = ["region", "province"]
target = ["gdp"]
ddf["year"] = ddf["year"].astype(float)
ddf["fdi"] = ddf["fdi"].astype(float)
#ddf["province"] = ddf["province"].astype(float)
#ddf["region"] = ddf["region"].astype(float)
ddf["gdp"] = ddf["gdp"].astype(float)
ddf["it"] = ddf["it"].astype(float)
type(target)
x=ddf[feat_list].persist()
y=ddf[target].persist()
x
y.compute()
print(x.shape,y.shape)
x.count().compute()
from dask_ml.xgboost import XGBRegressor
XGBR = XGBRegressor()
%%time
XGBR_model = XGBR.fit(x,y)
XGBR_model
client.close()