import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
import seaborn as sns
df = pd.read_csv('df_panel_fix.csv')
df_subset = df[["year", "reg", "province", "gdp", "fdi", 'it',"specific"]]
df_subset.columns = ["year", "region", "province", "gdp", "fdi", 'it',"specific"]
df=df_subset
df
year region province gdp fdi it specific
0 1996 East China Anhui 2093.30 50661 631930 147002.0
1 1997 East China Anhui 2347.32 43443 657860 151981.0
2 1998 East China Anhui 2542.96 27673 889463 174930.0
3 1999 East China Anhui 2712.34 26131 1227364 285324.0
4 2000 East China Anhui 2902.09 31847 1499110 195580.0
... ... ... ... ... ... ... ...
355 2003 East China Zhejiang 9705.02 498055 2261631 391292.0
356 2004 East China Zhejiang 11648.70 668128 3162299 656175.0
357 2005 East China Zhejiang 13417.68 772000 2370200 656175.0
358 2006 East China Zhejiang 15718.47 888935 2553268 1017303.0
359 2007 East China Zhejiang 18753.73 1036576 2939778 844647.0

360 rows × 7 columns

Distributions of Dependant Variables

Right skew

sns.distplot(df['gdp'])
<matplotlib.axes._subplots.AxesSubplot at 0x7f1043c20588>
sns.distplot(df['fdi'])
<matplotlib.axes._subplots.AxesSubplot at 0x7f10437f74a8>
sns.distplot(df['it'])
<matplotlib.axes._subplots.AxesSubplot at 0x7f1043a09ef0>
sns.distplot(df['specific'].dropna())
<matplotlib.axes._subplots.AxesSubplot at 0x7f10439b7a20>
df.hist(column=['fdi'], bins=60)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f10439a19e8>]],
      dtype=object)

Removal of GDP value outliers more than 3 standard deviations away from the mean

outlier removal of rows with GDP values that are > 3 standard deviations away form the mean

import scipy.stats as stats
df['gdp_zscore'] = stats.zscore(df['gdp'])

these are the observations more then > 3 SDs away from the mean of gdp that will be dropped

df[abs(df['gdp_zscore'])>3].hist(column = ['gdp'])
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f104364e0f0>]],
      dtype=object)
df_no_gdp_outliers=df[abs(df['gdp_zscore'])<3]
df_no_gdp_outliers
year region province gdp fdi it specific gdp_zscore
0 1996 East China Anhui 2093.30 50661 631930 147002.0 -0.521466
1 1997 East China Anhui 2347.32 43443 657860 151981.0 -0.464746
2 1998 East China Anhui 2542.96 27673 889463 174930.0 -0.421061
3 1999 East China Anhui 2712.34 26131 1227364 285324.0 -0.383239
4 2000 East China Anhui 2902.09 31847 1499110 195580.0 -0.340870
... ... ... ... ... ... ... ... ...
354 2002 East China Zhejiang 8003.67 307610 1962633 365437.0 0.798274
355 2003 East China Zhejiang 9705.02 498055 2261631 391292.0 1.178172
356 2004 East China Zhejiang 11648.70 668128 3162299 656175.0 1.612181
357 2005 East China Zhejiang 13417.68 772000 2370200 656175.0 2.007180
358 2006 East China Zhejiang 15718.47 888935 2553268 1017303.0 2.520929

350 rows × 8 columns

df_no_gdp_outliers.hist(column=['gdp'], bins=60)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f10429f5ba8>]],
      dtype=object)
counts_fiscal=df.groupby('region').count()
counts_fiscal
year province gdp fdi it specific gdp_zscore
region
East China 84 84 84 84 84 84 84
North China 48 48 48 48 48 47 48
Northeast China 36 36 36 36 36 36 36
Northwest China 60 60 60 60 60 60 60
South Central China 72 72 72 72 72 72 72
Southwest China 60 60 60 60 60 57 60
counts_fiscal=df.groupby('province').count()
counts_fiscal
year region gdp fdi it specific gdp_zscore
province
Anhui 12 12 12 12 12 12 12
Beijing 12 12 12 12 12 12 12
Chongqing 12 12 12 12 12 9 12
Fujian 12 12 12 12 12 12 12
Gansu 12 12 12 12 12 12 12
Guangdong 12 12 12 12 12 12 12
Guangxi 12 12 12 12 12 12 12
Guizhou 12 12 12 12 12 12 12
Hainan 12 12 12 12 12 12 12
Hebei 12 12 12 12 12 11 12
Heilongjiang 12 12 12 12 12 12 12
Henan 12 12 12 12 12 12 12
Hubei 12 12 12 12 12 12 12
Hunan 12 12 12 12 12 12 12
Jiangsu 12 12 12 12 12 12 12
Jiangxi 12 12 12 12 12 12 12
Jilin 12 12 12 12 12 12 12
Liaoning 12 12 12 12 12 12 12
Ningxia 12 12 12 12 12 12 12
Qinghai 12 12 12 12 12 12 12
Shaanxi 12 12 12 12 12 12 12
Shandong 12 12 12 12 12 12 12
Shanghai 12 12 12 12 12 12 12
Shanxi 12 12 12 12 12 12 12
Sichuan 12 12 12 12 12 12 12
Tianjin 12 12 12 12 12 12 12
Tibet 12 12 12 12 12 12 12
Xinjiang 12 12 12 12 12 12 12
Yunnan 12 12 12 12 12 12 12
Zhejiang 12 12 12 12 12 12 12

Subset by needed columns

df_no_gdp_outliers.columns
Index(['year', 'region', 'province', 'gdp', 'fdi', 'it', 'specific',
       'gdp_zscore'],
      dtype='object')
df_no_gdp_outliers_subset = df_no_gdp_outliers[['region', 'gdp', 'fdi', 'it']]
df_no_gdp_outliers_subset
region gdp fdi it
0 East China 2093.30 50661 631930
1 East China 2347.32 43443 657860
2 East China 2542.96 27673 889463
3 East China 2712.34 26131 1227364
4 East China 2902.09 31847 1499110
... ... ... ... ...
354 East China 8003.67 307610 1962633
355 East China 9705.02 498055 2261631
356 East China 11648.70 668128 3162299
357 East China 13417.68 772000 2370200
358 East China 15718.47 888935 2553268

350 rows × 4 columns

Genearate an experimental_crosstab to be used in statistical tests

experimental_crosstab = df_no_gdp_outliers_subset.groupby('region').agg(['size', 'mean', 'std'])
experimental_crosstab.index
Index(['East China', 'North China', 'Northeast China', 'Northwest China',
       'South Central China', 'Southwest China'],
      dtype='object', name='region')
experimental_crosstab = experimental_crosstab.reset_index()
experimental_crosstab
region gdp fdi it
size mean std size mean std size mean std
0 East China 78 6070.604231 3500.372702 78 355577.897436 275635.866746 78 1.775615e+06 1.153030e+06
1 North China 48 4239.038542 2866.705149 48 169600.583333 127011.475909 48 1.733719e+06 1.548794e+06
2 Northeast China 36 3849.076944 1948.531835 36 136623.750000 142734.495232 36 2.665148e+06 1.768442e+06
3 Northwest China 60 1340.026167 1174.399739 60 15111.133333 22954.193559 60 1.703538e+06 1.446408e+06
4 South Central China 68 4835.540882 3697.129915 68 218931.426471 339981.399823 68 2.500962e+06 2.196436e+06
5 Southwest China 60 2410.398833 2144.589994 60 25405.083333 31171.373876 60 2.424971e+06 2.002198e+06
experimental_crosstab.to_csv('fiscal_experimental_crosstab.csv')