Groupby and Pivot Tables in Python
import pandas as pd
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
df = pd.read_csv(url, on_bad_lines='skip')
#df
# Pivot: mean of 'it' by province (rows) and year (columns), rounded to the nearest ten;
# highlight each column's (year's) max in blue and each row's (province's) max in green
(
    df.pivot_table(index='province', columns='year', values='it', aggfunc='mean')
      .round(-1)
      .style.highlight_max(color='blue')
      .highlight_max(axis=1, color='green')
)
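# An added sketch of the same API: margins=True appends an 'All' row and column holding
# the grand means; this assumes the raw df still has its 'province', 'year', and 'it' columns.
df.pivot_table(index='province', columns='year', values='it', aggfunc='mean', margins=True)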
# Keep only the columns of interest and rename 'reg' to 'region'
df_subset = df[["year", "reg", "province", "gdp", "fdi", "it", "specific"]]
df_subset.columns = ["year", "region", "province", "gdp", "fdi", "it", "specific"]
df = df_subset
df
# Plot the distribution of each metric by region
import matplotlib.pyplot as plt
#fig, axes = plt.subplots(nrows=3, ncols=3)
test_cells = ['East China', 'North China']
metrics = ['gdp', 'fdi', 'it']
for test_cell in test_cells:
    for metric in metrics:
        df.loc[df["region"] == test_cell].hist(column=[metric], bins=60)
        print(test_cell)
        print(metric)
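# An added alternative sketch: pandas can draw one histogram panel per group in a
# single call via the `by` argument, shown here for GDP split by region.
df.hist(column='gdp', by='region', bins=60)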
df.hist(column=['fdi'], bins=60)
import seaborn as sns
sns.histplot(df['fdi'])
sns.displot(df['gdp'])
sns.displot(df['fdi'])
sns.displot(df['it'])
sns.displot(df['specific'].dropna())
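# An added sketch of the same distributions overlaid by group: seaborn's hue mapping draws
# one density-normalised histogram per region (assumes seaborn >= 0.11 for histplot).
sns.histplot(data=df, x='fdi', hue='region', element='step', stat='density', common_norm=False)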
import scipy.stats as stats
df['gdp_zscore'] = stats.zscore(df['gdp'])
# Inspect the rows flagged as GDP outliers (|z-score| > 3)
df[abs(df['gdp_zscore']) > 3].hist(column=['gdp'])
# Keep only the rows within three standard deviations of the mean GDP
df_no_gdp_outliers = df[abs(df['gdp_zscore']) < 3]
df_no_gdp_outliers
df_no_gdp_outliers.hist(column=['gdp'], bins=60)
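# A quick added sanity check on how many rows the |z| < 3 filter removed,
# using only the DataFrames created above.
print(f"Dropped {len(df) - len(df_no_gdp_outliers)} of {len(df)} rows as GDP outliers")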
# Non-null counts per column, grouped by region
counts_fiscal = df.groupby('region').count()
counts_fiscal
# Non-null counts per column, grouped by province
counts_fiscal = df.groupby('province').count()
counts_fiscal
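# An added groupby sketch going beyond counts: several summary statistics per region
# in a single .agg call, using the numeric columns kept above.
df.groupby('region')[['gdp', 'fdi', 'it']].agg(['mean', 'median', 'std'])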
#df_no_gdp_outliers.pivot_table(index='grouping column 1', columns='grouping column 2', values='aggregating column', aggfunc='sum')
#pd.crosstab(df_no_gdp_outliers, 'year')
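# A concrete (added) version of the commented templates above, assuming we want
# observation counts by region and year after removing the GDP outliers.
pd.crosstab(df_no_gdp_outliers['region'], df_no_gdp_outliers['year'])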
df_no_gdp_outliers_subset = df_no_gdp_outliers[['region', 'gdp', 'fdi', 'it']]
df_no_gdp_outliers_subset
def aggregate_and_ttest(dataset, groupby_feature='province', alpha=.05, test_cells=[0, 1]):
    # Imports
    from tqdm import tqdm
    from scipy.stats import ttest_ind_from_stats
    metrics = ['gdp', 'fdi', 'it']
    feature_size = 'size'
    feature_mean = 'mean'
    feature_std = 'std'
    results = []
    for metric in tqdm(metrics):
        # Per-group sample size, mean, and standard deviation for this metric;
        # the group labels stay in the index so .loc can look them up below
        crosstab = dataset.groupby(groupby_feature)[metric].agg(['size', 'mean', 'std'])
        print(crosstab)
        treatment = crosstab.index[test_cells[0]]
        control = crosstab.index[test_cells[1]]
        counts_control = crosstab.loc[control, feature_size]
        counts_treatment = crosstab.loc[treatment, feature_size]
        mean_control = crosstab.loc[control, feature_mean]
        mean_treatment = crosstab.loc[treatment, feature_mean]
        standard_deviation_control = crosstab.loc[control, feature_std]
        standard_deviation_treatment = crosstab.loc[treatment, feature_std]
        # Two-sample t-test computed from the summary statistics
        t_statistic, p_value = ttest_ind_from_stats(
            mean1=mean_treatment, std1=standard_deviation_treatment, nobs1=counts_treatment,
            mean2=mean_control, std2=standard_deviation_control, nobs2=counts_control)
        # f-string to report the t statistic and p value
        print(f"The t statistic of the comparison of the treatment test cell of {treatment} "
              f"compared to the control test cell of {control} for the metric of {metric} "
              f"is {t_statistic} and the p value is {p_value}.")
        # f-string to say whether the comparison is significant at the given alpha level
        if p_value < alpha:
            print(f'The comparison between {treatment} and {control} is statistically significant at the threshold of {alpha}')
        else:
            print(f'The comparison between {treatment} and {control} is not statistically significant at the threshold of {alpha}')
        results.append({'metric': metric, 'treatment': treatment, 'control': control,
                        't_statistic': t_statistic, 'p_value': p_value})
    # Return one row per metric so the results can be kept as a DataFrame
    return pd.DataFrame(results)
aggregate_and_ttest(df_no_gdp_outliers, test_cells = [0,2])
# Group by 'region' since the subset has no 'province' column; test_cells index
# into the sorted group labels (here the first two regions)
EastvNorth = aggregate_and_ttest(df_no_gdp_outliers_subset, groupby_feature='region', test_cells=[0, 1])
EastvNorth
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
import bootstrapped.compare_functions as bs_compare
# Treatment arm: GDP observations for Beijing
test_1 = df_no_gdp_outliers[df_no_gdp_outliers['province'] == 'Beijing']
test = test_1['gdp'].to_numpy()
test
# Control arm: GDP observations for Shanxi
control_1 = df_no_gdp_outliers[df_no_gdp_outliers['province'] == 'Shanxi']
control = control_1['gdp'].to_numpy()
control
# Shared bin edges spanning both samples so the two histograms are directly comparable
bins = np.linspace(min(test.min(), control.min()), max(test.max(), control.max()), 20)
plt.hist(control, bins=bins, label='Control')
plt.hist(test, bins=bins, label='Test', color='orange')
plt.title('Test/Ctrl Data')
plt.legend()
# Bootstrapped A/B comparison of total GDP (percent change of the sums)
bs.bootstrap_ab(test, control, stat_func=bs_stats.sum, compare_func=bs_compare.percent_change)
# run an a/b test simulation considering the lengths of the series (sum)
# consider the full 'volume' of values that are passed in
print(bs_compare.percent_change(test.sum(), control.sum()))
print(bs.bootstrap_ab(
test,
control,
stat_func=bs_stats.sum,
compare_func=bs_compare.percent_change
))
# run an a/b test simulation ignoring the lengths of the series (average)
# just what is the 'typical' value
# use the difference in means to compare test and control
print(bs_compare.difference(test.mean(), control.mean()))
print(bs.bootstrap_ab(test, control, bs_stats.mean, bs_compare.difference))
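# An added follow-up sketch: bootstrapped can also put a confidence interval around a
# single group's mean (no control comparison), reusing the 'test' array from above.
print(bs.bootstrap(test, stat_func=bs_stats.mean))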