# Evaluating distributions and generating experimental crosstabs for the evaluation of experiments
import numpy as np
import dask.array as da
import pandas as pd
import sqlalchemy as db
from sqlalchemy import create_engine
import sqlite3
import pandas as pd
import seaborn as sns
# Load the panel data and keep only the columns used by this analysis.
df = pd.read_csv('df_panel_fix.csv')

# ``reg`` is renamed to ``region`` by name rather than by overwriting
# ``.columns`` positionally, so the labels stay correct even if the selection
# list is reordered.  ``rename`` returns a new DataFrame instead of a slice of
# the raw frame, so adding columns to ``df`` later does not trigger pandas'
# SettingWithCopyWarning.
df_subset = df[["year", "reg", "province", "gdp", "fdi", "it", "specific"]].rename(
    columns={"reg": "region"}
)

# From here on ``df`` and ``df_subset`` refer to the same trimmed frame.
df = df_subset
# Distribution plots for the three indicators that are plotted as-is.
# NOTE(review): seaborn documents ``distplot`` as deprecated in favour of
# ``histplot`` / ``displot`` -- confirm the installed version before migrating.
for indicator in ('gdp', 'fdi', 'it'):
    sns.distplot(df[indicator])

# ``specific`` has its missing values dropped before plotting.
specific_without_gaps = df['specific'].dropna()
sns.distplot(specific_without_gaps)

# Finer-grained (60-bin) pandas histogram of FDI.
df.hist(column=['fdi'], bins=60)
import scipy.stats as stats

# Standardise GDP so each row's distance from the mean is expressed in
# standard deviations, and store it alongside the data.
# NOTE(review): scipy's ``zscore`` propagates NaN by default, so a single
# missing ``gdp`` value would make every z-score NaN -- confirm gdp has no gaps.
df['gdp_zscore'] = stats.zscore(df['gdp'])

# Histogram of the GDP values lying more than 3 standard deviations out.
df.loc[df['gdp_zscore'].abs() > 3].hist(column=['gdp'])

# Keep the rows strictly inside the 3-sigma band and plot their GDP
# distribution with 60 bins.
df_no_gdp_outliers = df.loc[df['gdp_zscore'].abs() < 3]
df_no_gdp_outliers.hist(column=['gdp'], bins=60)
# Non-null observation counts per column, first grouped by region ...
counts_fiscal = df.groupby(by='region').count()

# ... then grouped by province.  The name is reused, so ``counts_fiscal``
# ends up holding the per-province counts.
counts_fiscal = df.groupby(by='province').count()
# Restrict the outlier-free data to the grouping key and the three indicators
# that are summarised.
df_no_gdp_outliers_subset = df_no_gdp_outliers.loc[:, ['region', 'gdp', 'fdi', 'it']]

# Per-region group size, mean and standard deviation of every indicator, with
# ``region`` moved back from the index into an ordinary column.
experimental_crosstab = (
    df_no_gdp_outliers_subset
    .groupby('region')
    .agg(['size', 'mean', 'std'])
    .reset_index()
)

# Persist the crosstab.
# NOTE(review): with the default ``index=True`` the fresh 0..n-1 row index is
# written as the first CSV column -- confirm downstream readers expect it.
experimental_crosstab.to_csv('fiscal_experimental_crosstab.csv')