# EDA for Health Data (Pipeline Step 2)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Location of the stroke training CSV that the rest of this script explores.
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# Malformed rows are skipped with a warning rather than aborting the load.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`
# (deprecated in pandas 1.3, removed in pandas 2.0) and keeps the old
# skip-and-warn behaviour.
df = pd.read_csv(url, on_bad_lines='warn')
# Bare expression: echoes the DataFrame when run as a notebook cell.
df
#df = df.drop(columns = ['id'])
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
# Build the profiling report for the whole DataFrame once, then emit it in
# every supported form: interactive widgets, an inline iframe, and an HTML file.
profile = ProfileReport(
    df,
    title='Pandas Profiling Report',
)
for render_inline in (profile.to_widgets, profile.to_notebook_iframe):
    render_inline()
profile.to_file(output_file="pandas_profiling.html")
# Report the share of missing values per column (as a whole-number percent)
# and remember every column whose rounded share is above zero as a
# [column, percent] pair in `pct_list`.
pct_list = []
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    rounded_pct = round(pct_missing*100)
    if rounded_pct > 0:
        pct_list.append([col, rounded_pct])
    print(f'{col} - {rounded_pct}%')
# Heatmap of the null mask of every column, drawn with a two-colour palette
# so missing and present cells are visually distinct.
cols = df.columns
colours = ['darkblue', 'red']
null_mask = df[cols].isnull()
two_colour_palette = sns.color_palette(colours)
sns.heatmap(null_mask, cmap=two_colour_palette)
# Sum of the `stroke` column for every (age, heart_disease) combination.
(
    df.groupby(["age", 'heart_disease'])['stroke']
    .agg(['sum'])
    .round(0)
)
# Column dtypes, echoed when run as a notebook cell.
df.dtypes
# Discretize the continuous columns into equal-width bins.
# Each entry is (source column, new binned column, exclusive upper edge
# passed to np.arange, bin width); the bin edges always start at 0.
binning_plan = (
    ('age', 'age_binned', 91, 5),
    ('avg_glucose_level', 'avg_glucose_level_binned', 301, 10),
    ('bmi', 'bmi_binned', 101, 5),
)
for source_col, binned_col, stop, step in binning_plan:
    df[binned_col] = pd.cut(df[source_col], np.arange(0, stop, step))
import seaborn as sns
# Correlation matrix of the three continuous measurements, drawn as an
# annotated heatmap with the colour scale pinned to [-1, 1].
continuous_corr = df[['age', 'avg_glucose_level', 'bmi']].corr()
heatmap = sns.heatmap(continuous_corr, annot=True, vmin=-1, vmax=1)
# The trailing semicolon keeps a notebook from echoing the returned Text object.
heatmap.set_title('Correlation Heatmap');
def get_stacked_bar_chart(column):
    """Plot record counts per value of `column`, stacked by `stroke`.

    Reads the module-level `df`. Counts are the number of non-null `age`
    entries in each (column value, stroke value) group.

    Args:
        column: Name of the `df` column that defines the bars.

    Returns:
        The object returned by `DataFrame.plot.bar` for the stacked chart.
    """
    counts_by_outcome = (
        df.groupby([column, 'stroke'])['age']
        .count()
        .unstack()  # rows: `column` values, columns: `stroke` values
    )
    return counts_by_outcome.plot.bar(stacked=True, figsize=(6, 6), width=1)
def get_100_percent_stacked_bar_chart(column, width = 0.5):
    """Plot, for each value of `column`, the percentage split by `stroke`.

    Reads the module-level `df`. Each bar shows the share of every `stroke`
    value within one `column` value, so every bar stacks to 100%. Counts are
    taken over non-null `age` entries.

    Args:
        column: Name of the `df` column that defines the bars.
        width: Bar width forwarded to `DataFrame.plot.bar`.

    Returns:
        The object returned by `DataFrame.plot.bar` for the stacked chart.
    """
    # Records per (column value, stroke value) pair.
    per_outcome = df.groupby([column, 'stroke'])['age'].count()
    # Records per column value, used as the denominator.
    per_category = df.groupby([column])['age'].count()
    # Percentages, reshaped to rows = column values, columns = stroke values.
    share = (per_outcome / per_category * 100).unstack()
    return share.plot.bar(stacked=True, figsize=(6, 6), width=width)
# Age related to risk
# Binned numeric features: absolute counts first, then the 100% view with
# wider bars.
for binned_col in ('age_binned', 'bmi_binned', 'avg_glucose_level_binned'):
    get_stacked_bar_chart(binned_col)
    get_100_percent_stacked_bar_chart(binned_col, width = 0.9)

# Categorical features: 100% view only, at the default bar width.
for category_col in ('hypertension', 'heart_disease', 'gender',
                     'Residence_type', 'work_type'):
    get_100_percent_stacked_bar_chart(category_col)

# Record count and mean age per work type.
df.groupby(['work_type'])[['age']].agg(['count', 'mean'])
get_100_percent_stacked_bar_chart('ever_married')
# Record count and mean age per marital status.
df.groupby(['ever_married'])[['age']].agg(['count', 'mean'])
# Record counts per residence type, coloured by smoking status, with one
# panel per work type.
g = sns.catplot(
    data=df,
    kind="count",
    x="Residence_type",
    hue="smoking_status",
    col="work_type",
    height=4,
    aspect=.7,
)
import missingno
# Matrix view of missing values across the whole DataFrame, on a wide canvas.
missingno.matrix(
    df,
    figsize=(30, 5),
)
# One count plot per categorical column, laid out side by side in one figure.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(25, 7))
fig.suptitle("Countplot for the dataset", fontsize=35)
countplot_columns = ("gender", "stroke", "ever_married", "hypertension")
for count_col, count_ax in zip(countplot_columns, (ax1, ax2, ax3, ax4)):
    sns.countplot(x=count_col, data=df, ax=count_ax)
# Kernel density of age, coloured by gender, in a grid with one column per
# smoking status and one row per residence type.
sns.displot(
    data=df,
    x="age",
    kind="kde",
    hue="gender",
    col="smoking_status",
    row="Residence_type",
)
# Distribution of each continuous measurement, split by stroke outcome,
# as three box plots side by side in one figure.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 7))
fig.suptitle("Boxplot for Dataset", fontsize=35)
boxplot_measures = ("avg_glucose_level", "bmi", "age")
for measure, measure_ax in zip(boxplot_measures, (ax1, ax2, ax3)):
    sns.boxplot(x="stroke", y=measure, data=df, ax=measure_ax)
# Compute a Kendall correlation matrix and convert it to long form.
# Only numeric/boolean columns are passed to corr(): `df` also holds string
# and categorical columns, and pandas >= 2.0 raises on those instead of
# silently dropping them as older versions did.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr_mat = numeric_df.corr(method="kendall").stack().reset_index(name="correlation")

# Draw each cell as a scatter point with varying size and color
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), edgecolor=".7",
    height=5, sizes=(50, 250), size_norm=(-.2, .8),
)

# Tweak the figure to finalize
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(0.25)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)

# Matplotlib 3.7 renamed Legend.legendHandles to Legend.legend_handles and
# later removed the old attribute; look up whichever one exists.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for artist in legend_handles:
    artist.set_edgecolor(".1")
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the integer cast below is applied to `df` as well.
strokes_temp_df = df
strokes_temp_df[['stroke', 'hypertension']] = df[['stroke', 'hypertension']].astype('int')

# Correlation matrix over the numeric/boolean columns only: pandas >= 2.0
# raises on string and categorical columns instead of dropping them silently.
corr = strokes_temp_df.select_dtypes(include=["number", "bool"]).corr()
corr.style.background_gradient()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its documented replacement.
corr.style.background_gradient().format(precision=2)