Moving Dask XGboost with Fiscal Data, saving and loading Dask XGboost models
This post includes code from Scalable-Data-Analysis-in-Python-with-Dask and coiled-examples.
import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
engine = db.create_engine('sqlite:///fiscal.db')
connection = engine.connect()
metadata = db.MetaData()
engine.execute("SELECT * FROM fiscal_table LIMIT 10").fetchall()
sql = """
SELECT year
, region
, province
, gdp
, fdi
, it
, specific
FROM fiscal_table
"""
cnxn = connection
df = pd.read_sql(sql, cnxn)
df
df.columns
df.gdp.hist()
from dask.distributed import Client
client = Client(processes=False, threads_per_worker=2,
n_workers=3, memory_limit='4GB')
client
client.restart()
from dask import dataframe as dd
ddf = dd.from_pandas(df, npartitions=5)
print(ddf)
ddf.head()
client.id
feat_list = ["year", "fdi"]
cat_feat_list = ["region", "province"]
target = ["gdp"]
ddf["year"] = ddf["year"].astype(int)
ddf["fdi"] = ddf["fdi"].astype(float)
ddf["gdp"] = ddf["gdp"].astype(float)
ddf["it"] = ddf["it"].astype(float)
# ddf["province"] = ddf["province"].astype(float)
# ddf["region"] = ddf["region"].astype(float)
x=ddf[feat_list].persist()
y=ddf[target].persist()
x
y.compute()
print(x.shape,y.shape)
x.count().compute()
from dask_ml.xgboost import XGBRegressor
XGBR = XGBRegressor()
%%time
XGBR_model = XGBR.fit(x,y)
XGBR_model
XGBR_model.save_model('fiscal_model')
XGBR_model.load_model('fiscal_model')