import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
import seaborn as sns

df = pd.read_csv('df_panel_fix.csv')

df_subset = df[["year", "reg", "province", "gdp", "fdi", 'it',"specific"]]
df_subset.columns = ["year", "region", "province", "gdp", "fdi", 'it',"specific"]

df=df_subset
df

Distributions of Dependant Variables

Right skew

sns.distplot(df['gdp'])

<matplotlib.axes._subplots.AxesSubplot at 0x7f1043c20588>

sns.distplot(df['fdi'])

<matplotlib.axes._subplots.AxesSubplot at 0x7f10437f74a8>

sns.distplot(df['it'])

<matplotlib.axes._subplots.AxesSubplot at 0x7f1043a09ef0>

sns.distplot(df['specific'].dropna())

<matplotlib.axes._subplots.AxesSubplot at 0x7f10439b7a20>

df.hist(column=['fdi'], bins=60)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f10439a19e8>]],
      dtype=object)

Removal of GDP value outliers more than 3 standard deviations away from the mean

outlier removal of rows with GDP values that are > 3 standard deviations away form the mean

import scipy.stats as stats

df['gdp_zscore'] = stats.zscore(df['gdp'])

these are the observations more then > 3 SDs away from the mean of gdp that will be dropped

df[abs(df['gdp_zscore'])>3].hist(column = ['gdp'])

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f104364e0f0>]],
      dtype=object)

df_no_gdp_outliers=df[abs(df['gdp_zscore'])<3]

df_no_gdp_outliers

df_no_gdp_outliers.hist(column=['gdp'], bins=60)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f10429f5ba8>]],
      dtype=object)

counts_fiscal=df.groupby('region').count()
counts_fiscal

counts_fiscal=df.groupby('province').count()
counts_fiscal

Subset by needed columns

df_no_gdp_outliers.columns

Index(['year', 'region', 'province', 'gdp', 'fdi', 'it', 'specific',
       'gdp_zscore'],
      dtype='object')

df_no_gdp_outliers_subset = df_no_gdp_outliers[['region', 'gdp', 'fdi', 'it']]
df_no_gdp_outliers_subset

Genearate an experimental_crosstab to be used in statistical tests

experimental_crosstab = df_no_gdp_outliers_subset.groupby('region').agg(['size', 'mean', 'std'])

experimental_crosstab.index

Index(['East China', 'North China', 'Northeast China', 'Northwest China',
       'South Central China', 'Southwest China'],
      dtype='object', name='region')

experimental_crosstab = experimental_crosstab.reset_index()

experimental_crosstab

experimental_crosstab.to_csv('fiscal_experimental_crosstab.csv')

	year	region	province	gdp	fdi	it	specific
0	1996	East China	Anhui	2093.30	50661	631930	147002.0
1	1997	East China	Anhui	2347.32	43443	657860	151981.0
2	1998	East China	Anhui	2542.96	27673	889463	174930.0
3	1999	East China	Anhui	2712.34	26131	1227364	285324.0
4	2000	East China	Anhui	2902.09	31847	1499110	195580.0
...	...	...	...	...	...	...	...
355	2003	East China	Zhejiang	9705.02	498055	2261631	391292.0
356	2004	East China	Zhejiang	11648.70	668128	3162299	656175.0
357	2005	East China	Zhejiang	13417.68	772000	2370200	656175.0
358	2006	East China	Zhejiang	15718.47	888935	2553268	1017303.0
359	2007	East China	Zhejiang	18753.73	1036576	2939778	844647.0

	year	region	province	gdp	fdi	it	specific	gdp_zscore
0	1996	East China	Anhui	2093.30	50661	631930	147002.0	-0.521466
1	1997	East China	Anhui	2347.32	43443	657860	151981.0	-0.464746
2	1998	East China	Anhui	2542.96	27673	889463	174930.0	-0.421061
3	1999	East China	Anhui	2712.34	26131	1227364	285324.0	-0.383239
4	2000	East China	Anhui	2902.09	31847	1499110	195580.0	-0.340870
...	...	...	...	...	...	...	...	...
354	2002	East China	Zhejiang	8003.67	307610	1962633	365437.0	0.798274
355	2003	East China	Zhejiang	9705.02	498055	2261631	391292.0	1.178172
356	2004	East China	Zhejiang	11648.70	668128	3162299	656175.0	1.612181
357	2005	East China	Zhejiang	13417.68	772000	2370200	656175.0	2.007180
358	2006	East China	Zhejiang	15718.47	888935	2553268	1017303.0	2.520929

	year	region	gdp	fdi	it	specific	gdp_zscore
province
Anhui	12	12	12	12	12	12	12
Beijing	12	12	12	12	12	12	12
Chongqing	12	12	12	12	12	9	12
Fujian	12	12	12	12	12	12	12
Gansu	12	12	12	12	12	12	12
Guangdong	12	12	12	12	12	12	12
Guangxi	12	12	12	12	12	12	12
Guizhou	12	12	12	12	12	12	12
Hainan	12	12	12	12	12	12	12
Hebei	12	12	12	12	12	11	12
Heilongjiang	12	12	12	12	12	12	12
Henan	12	12	12	12	12	12	12
Hubei	12	12	12	12	12	12	12
Hunan	12	12	12	12	12	12	12
Jiangsu	12	12	12	12	12	12	12
Jiangxi	12	12	12	12	12	12	12
Jilin	12	12	12	12	12	12	12
Liaoning	12	12	12	12	12	12	12
Ningxia	12	12	12	12	12	12	12
Qinghai	12	12	12	12	12	12	12
Shaanxi	12	12	12	12	12	12	12
Shandong	12	12	12	12	12	12	12
Shanghai	12	12	12	12	12	12	12
Shanxi	12	12	12	12	12	12	12
Sichuan	12	12	12	12	12	12	12
Tianjin	12	12	12	12	12	12	12
Tibet	12	12	12	12	12	12	12
Xinjiang	12	12	12	12	12	12	12
Yunnan	12	12	12	12	12	12	12
Zhejiang	12	12	12	12	12	12	12

	region	gdp	fdi	it
0	East China	2093.30	50661	631930
1	East China	2347.32	43443	657860
2	East China	2542.96	27673	889463
3	East China	2712.34	26131	1227364
4	East China	2902.09	31847	1499110
...	...	...	...	...
354	East China	8003.67	307610	1962633
355	East China	9705.02	498055	2261631
356	East China	11648.70	668128	3162299
357	East China	13417.68	772000	2370200
358	East China	15718.47	888935	2553268

	region	gdp			fdi			it
		size	mean	std	size	mean	std	size	mean	std
0	East China	78	6070.604231	3500.372702	78	355577.897436	275635.866746	78	1.775615e+06	1.153030e+06
1	North China	48	4239.038542	2866.705149	48	169600.583333	127011.475909	48	1.733719e+06	1.548794e+06
2	Northeast China	36	3849.076944	1948.531835	36	136623.750000	142734.495232	36	2.665148e+06	1.768442e+06
3	Northwest China	60	1340.026167	1174.399739	60	15111.133333	22954.193559	60	1.703538e+06	1.446408e+06
4	South Central China	68	4835.540882	3697.129915	68	218931.426471	339981.399823	68	2.500962e+06	2.196436e+06
5	Southwest China	60	2410.398833	2144.589994	60	25405.083333	31171.373876	60	2.424971e+06	2.002198e+06