Groupby and Pivot Tables in Python
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
df = pd.read_csv(url, on_bad_lines='skip')
#df
# Pivot: mean of 'it' by province (rows) and year (columns), rounded to the nearest ten;
# highlight each column's (year's) max in blue and each row's (province's) max in green
(
    df.pivot_table(index='province', columns='year', values='it', aggfunc='mean')
      .round(-1)
      .style.highlight_max(color='blue')
      .highlight_max(axis=1, color='green')
)
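# An added sketch of the same API: margins=True appends an 'All' row and column holding
# the grand means; this assumes the raw df still has its 'province', 'year', and 'it' columns.
df.pivot_table(index='province', columns='year', values='it', aggfunc='mean', margins=True)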
# Keep only the columns of interest and rename 'reg' to 'region'
df_subset = df[["year", "reg", "province", "gdp", "fdi", "it", "specific"]]
df_subset.columns = ["year", "region", "province", "gdp", "fdi", "it", "specific"]
df = df_subset
df
# Plot the distribution of each metric by region
import matplotlib.pyplot as plt
#fig, axes = plt.subplots(nrows=3, ncols=3)
test_cells = ['East China', 'North China']
metrics = ['gdp', 'fdi', 'it']
for test_cell in test_cells:
    for metric in metrics:
        df.loc[df["region"] == test_cell].hist(column=[metric], bins=60)
        print(test_cell)
        print(metric)
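# An added alternative sketch: pandas can draw one histogram panel per group in a
# single call via the `by` argument, shown here for GDP split by region.
df.hist(column='gdp', by='region', bins=60)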
df.hist(column=['fdi'], bins=60)
import seaborn as sns
sns.histplot(df['fdi'])
sns.displot(df['gdp'])
sns.displot(df['fdi'])
sns.displot(df['it'])
sns.displot(df['specific'].dropna())
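# An added sketch of the same distributions overlaid by group: seaborn's hue mapping draws
# one density-normalised histogram per region (assumes seaborn >= 0.11 for histplot).
sns.histplot(data=df, x='fdi', hue='region', element='step', stat='density', common_norm=False)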
import scipy.stats as stats
df['gdp_zscore'] = stats.zscore(df['gdp'])
# Inspect the rows flagged as GDP outliers (|z-score| > 3)
df[abs(df['gdp_zscore']) > 3].hist(column=['gdp'])
# Keep only the rows within three standard deviations of the mean GDP
df_no_gdp_outliers = df[abs(df['gdp_zscore']) < 3]
df_no_gdp_outliers
df_no_gdp_outliers.hist(column=['gdp'], bins=60)
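# A quick added sanity check on how many rows the |z| < 3 filter removed,
# using only the DataFrames created above.
print(f"Dropped {len(df) - len(df_no_gdp_outliers)} of {len(df)} rows as GDP outliers")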
# Non-null counts per column, grouped by region
counts_fiscal = df.groupby('region').count()
counts_fiscal
# Non-null counts per column, grouped by province
counts_fiscal = df.groupby('province').count()
counts_fiscal
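# An added groupby sketch going beyond counts: several summary statistics per region
# in a single .agg call, using the numeric columns kept above.
df.groupby('region')[['gdp', 'fdi', 'it']].agg(['mean', 'median', 'std'])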
#df_no_gdp_outliers.pivot_table(index='grouping column 1', columns='grouping column 2', values='aggregating column', aggfunc='sum')
#pd.crosstab(df_no_gdp_outliers, 'year')
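# A concrete (added) version of the commented templates above, assuming we want
# observation counts by region and year after removing the GDP outliers.
pd.crosstab(df_no_gdp_outliers['region'], df_no_gdp_outliers['year'])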
df_no_gdp_outliers_subset = df_no_gdp_outliers[['region', 'gdp', 'fdi', 'it']]
df_no_gdp_outliers_subset
def aggregate_and_ttest(dataset, groupby_feature='province', alpha=.05, test_cells=[0, 1]):
    # Imports
    from tqdm import tqdm
    from scipy.stats import ttest_ind_from_stats
    metrics = ['gdp', 'fdi', 'it']
    feature_size = 'size'
    feature_mean = 'mean'
    feature_std = 'std'
    results = []
    for metric in tqdm(metrics):
        # Per-group sample size, mean, and standard deviation for this metric;
        # the group labels stay in the index so .loc can look them up below
        crosstab = dataset.groupby(groupby_feature)[metric].agg(['size', 'mean', 'std'])
        print(crosstab)
        treatment = crosstab.index[test_cells[0]]
        control = crosstab.index[test_cells[1]]
        counts_control = crosstab.loc[control, feature_size]
        counts_treatment = crosstab.loc[treatment, feature_size]
        mean_control = crosstab.loc[control, feature_mean]
        mean_treatment = crosstab.loc[treatment, feature_mean]
        standard_deviation_control = crosstab.loc[control, feature_std]
        standard_deviation_treatment = crosstab.loc[treatment, feature_std]
        # Two-sample t-test computed from the summary statistics
        t_statistic, p_value = ttest_ind_from_stats(
            mean1=mean_treatment, std1=standard_deviation_treatment, nobs1=counts_treatment,
            mean2=mean_control, std2=standard_deviation_control, nobs2=counts_control)
        # f-string to report the t statistic and p value
        print(f"The t statistic of the comparison of the treatment test cell of {treatment} "
              f"compared to the control test cell of {control} for the metric of {metric} "
              f"is {t_statistic} and the p value is {p_value}.")
        # f-string to say whether the comparison is significant at the given alpha level
        if p_value < alpha:
            print(f'The comparison between {treatment} and {control} is statistically significant at the threshold of {alpha}')
        else:
            print(f'The comparison between {treatment} and {control} is not statistically significant at the threshold of {alpha}')
        results.append({'metric': metric, 'treatment': treatment, 'control': control,
                        't_statistic': t_statistic, 'p_value': p_value})
    # Return one row per metric so the results can be kept as a DataFrame
    return pd.DataFrame(results)
aggregate_and_ttest(df_no_gdp_outliers, test_cells = [0,2])
# Group by 'region' since the subset has no 'province' column; test_cells index
# into the sorted group labels (here the first two regions)
EastvNorth = aggregate_and_ttest(df_no_gdp_outliers_subset, groupby_feature='region', test_cells=[0, 1])
EastvNorth
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import bootstrapped.compare_functions as bs_compare
# Treatment arm: GDP observations for Beijing
test_1 = df_no_gdp_outliers[df_no_gdp_outliers['province'] == 'Beijing']
test = test_1['gdp'].to_numpy()
test
# Control arm: GDP observations for Shanxi
control_1 = df_no_gdp_outliers[df_no_gdp_outliers['province'] == 'Shanxi']
control = control_1['gdp'].to_numpy()
control
# Shared bin edges spanning both samples so the two histograms are directly comparable
bins = np.linspace(min(test.min(), control.min()), max(test.max(), control.max()), 20)
plt.hist(control, bins=bins, label='Control')
plt.hist(test, bins=bins, label='Test', color='orange')
plt.title('Test/Ctrl Data')
plt.legend()
# Bootstrapped A/B comparison of total GDP (percent change of the sums)
bs.bootstrap_ab(test, control, stat_func=bs_stats.sum, compare_func=bs_compare.percent_change)
# run an a/b test simulation considering the lengths of the series (sum)
# consider the full 'volume' of values that are passed in
print(bs_compare.percent_change(test.sum(), control.sum()))
print(bs.bootstrap_ab(
test,
control,
stat_func=bs_stats.sum,
compare_func=bs_compare.percent_change
))
# run an a/b test simulation ignoring the lengths of the series (average)
# just what is the 'typical' value
# use the difference in means to compare test and control
print(bs_compare.difference(test.mean(), control.mean()))
print(bs.bootstrap_ab(test, control, bs_stats.mean, bs_compare.difference))
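# An added follow-up sketch: bootstrapped can also put a confidence interval around a
# single group's mean (no control comparison), reusing the 'test' array from above.
print(bs.bootstrap(test, stat_func=bs_stats.mean))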