# EDA on Healthcare Data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
# --- Load the stroke training data and prepare it for plotting -------------
url = 'https://raw.githubusercontent.com/davidrkearney/colab-notebooks/main/datasets/strokes_training.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and likewise drops malformed rows.
df = pd.read_csv(url, on_bad_lines='skip')
df  # notebook display of the raw frame (no effect when run as a script)
df.info()
df['stroke'].value_counts()
# Original author's observation: the labeled target is unbalanced.

# Drop the id column
df.drop(columns=['id'], inplace=True)

# Show records where the patient suffered a stroke but bmi is missing.
# The comparison is parenthesised explicitly: `&` binds tighter than `==`,
# so the unparenthesised form evaluates `(isna & stroke) == 1`, which only
# matches the intended filter when stroke is strictly 0/1.
df[df['bmi'].isna() & (df['stroke'] == 1)]

# Replace the missing bmi values with the column mean (rounded to 1 decimal).
# Assign the result back instead of `fillna(..., inplace=True)` on a column
# selection: the chained in-place form is deprecated and does not update `df`
# under pandas Copy-on-Write.
df['bmi'] = df['bmi'].fillna(np.round(df['bmi'].mean(), 1))

# Put missing smoking_status values into a new category named 'not known'.
df['smoking_status'] = df['smoking_status'].fillna('not known')
print(df['smoking_status'].value_counts())

# Discretize the continuous features into equal-width bins
# (width 5 for age and bmi, width 10 for avg_glucose_level).
df['age_binned'] = pd.cut(df['age'], np.arange(0, 91, 5))
df['avg_glucose_level_binned'] = pd.cut(df['avg_glucose_level'], np.arange(0, 301, 10))
df['bmi_binned'] = pd.cut(df['bmi'], np.arange(0, 101, 5))
# Create the correlation heatmap for the three continuous features.
# The columns 'age_norm', 'avg_glucose_level_norm' and 'bmi_norm' are never
# created in this file, so selecting them raises KeyError. The raw columns
# are used instead; Pearson correlation is unchanged by a per-column linear
# rescaling, so the coefficients match what normalized copies would give
# (assuming the *_norm columns were meant to be min-max / z-score scalings).
heatmap = sns.heatmap(
    df[['age', 'avg_glucose_level', 'bmi']].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
)
# Create the title
heatmap.set_title('Correlation Heatmap')
def get_stacked_bar_chart(column):
    """Plot a stacked bar chart of record counts per ``column`` value.

    Reads the module-level DataFrame ``df``. Each bar is one value of
    ``column``; its segments are the values of the ``stroke`` column.

    Args:
        column: Name of the column in ``df`` to group on.

    Returns:
        Whatever ``DataFrame.plot.bar`` returns for the count table.
    """
    # size() counts rows per group directly, so the result does not depend on
    # missing values in some unrelated column (the previous version counted
    # non-null 'age' entries). observed=False spells out the behaviour the
    # code already relied on for categorical (binned) columns and avoids the
    # pandas FutureWarning about the changing default.
    counts = df.groupby([column, 'stroke'], observed=False).size()
    # Pivot 'stroke' into columns: one row per bar, one column per segment.
    counts = counts.unstack()
    return counts.plot.bar(stacked=True, figsize=(6, 6), width=1)
def get_100_percent_stacked_bar_chart(column, width = 0.5):
    """Plot a 100% stacked bar chart of ``stroke`` shares per ``column`` value.

    Reads the module-level DataFrame ``df``. Each bar is one value of
    ``column``; its segments show what percentage of that value's records
    fall into each ``stroke`` class.

    Args:
        column: Name of the column in ``df`` to group on.
        width: Bar width passed through to ``DataFrame.plot.bar``.

    Returns:
        Whatever ``DataFrame.plot.bar`` returns for the percentage table.
    """
    # Count records per (column value, stroke) pair and pivot 'stroke' into
    # columns. size() counts rows regardless of missing values elsewhere;
    # observed=False keeps the existing handling of categorical (binned)
    # columns and avoids the pandas FutureWarning about the changing default.
    counts = (
        df.groupby([column, 'stroke'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    # Row totals are the record counts per `column` value; dividing row-wise
    # makes every non-empty bar sum to 100. Groups with no records yield NaN
    # and are simply not drawn.
    totals = counts.sum(axis=1)
    percentages = counts.div(totals, axis=0) * 100
    return percentages.plot.bar(stacked=True, figsize=(6, 6), width=width)
# Binned continuous features (age first, as it relates to risk): for each one
# draw the absolute stacked counts followed by the 100% stacked view.
for binned_feature in ('age_binned', 'bmi_binned', 'avg_glucose_level_binned'):
    get_stacked_bar_chart(binned_feature)
    get_100_percent_stacked_bar_chart(binned_feature, width=0.9)

# Categorical features: only the 100% stacked view, at the default bar width.
for categorical_feature in (
    'hypertension',
    'heart_disease',
    'gender',
    'Residence_type',
    'work_type',
):
    get_100_percent_stacked_bar_chart(categorical_feature)

# Record count and mean age per work type (displayed when run in a notebook).
df.groupby(['work_type']).agg({'age': ['count', 'mean']})

get_100_percent_stacked_bar_chart('ever_married')

# Record count and mean age per marital status (displayed in a notebook).
df.groupby(['ever_married']).agg({'age': ['count', 'mean']})
# `missingno` is used below but was never imported anywhere in this file,
# which raises NameError; import it here so this section runs on its own.
import missingno

# Count of records by residence type, coloured by smoking status, with one
# facet column per work type.
g = sns.catplot(
    x="Residence_type",
    hue="smoking_status",
    col="work_type",
    data=df,
    kind="count",
    height=4,
    aspect=.7,
)

# Matrix view of missing values across all columns.
# NOTE(review): the missing bmi and smoking_status values are filled earlier
# in this file, so those two columns will show as complete here - move this
# call before the fillna steps if the raw missingness is what you want to see.
missingno.matrix(df, figsize=(30, 5))
# One figure with four side-by-side count plots, one per categorical column.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(25, 7))
fig.suptitle("Countplot for the dataset", fontsize=35)
counted_columns = ("gender", "stroke", "ever_married", "hypertension")
for counted_column, target_axis in zip(counted_columns, (ax1, ax2, ax3, ax4)):
    sns.countplot(x=counted_column, data=df, ax=target_axis)
# Age density (KDE) per gender, faceted by smoking status (columns) and
# residence type (rows).
sns.displot(
    data=df,
    x="age",
    hue="gender",
    kind="kde",
    col="smoking_status",
    row="Residence_type",
)

# One figure with three box plots comparing a numeric feature across the
# stroke classes.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 7))
fig.suptitle("Boxplot for Dataset", fontsize=35)
boxed_features = ("avg_glucose_level", "bmi", "age")
for boxed_feature, target_axis in zip(boxed_features, (ax1, ax2, ax3)):
    sns.boxplot(x="stroke", y=boxed_feature, data=df, ax=target_axis)
# Compute a Kendall correlation matrix and convert it to long form.
# Only numeric columns are used: `df` also holds string and categorical
# (binned) columns, and since pandas 2.0 `DataFrame.corr` raises on those
# instead of silently dropping them.
corr_mat = (
    df.select_dtypes(include="number")
    .corr("kendall")
    .stack()
    .reset_index(name="correlation")
)

# Draw each cell as a scatter point with varying size and color
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="vlag", hue_norm=(-1, 1), edgecolor=".7",
    height=5, sizes=(50, 250), size_norm=(-.2, .8),
)

# Tweak the figure to finalize
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(0.25)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)

# Matplotlib 3.7 renamed `Legend.legendHandles` to `legend_handles` and 3.9
# removed the old name; prefer the new attribute and fall back to the old one
# so the code runs on both sides of the rename.
legend_artists = getattr(g.legend, "legend_handles", None)
if legend_artists is None:
    legend_artists = g.legend.legendHandles
for artist in legend_artists:
    artist.set_edgecolor(".1")
# Work on a real copy: plain assignment (`strokes_temp_df = df`) only creates
# a second name for the same object, so the dtype cast below would silently
# modify `df` as well.
strokes_temp_df = df.copy()
strokes_temp_df[['stroke', 'hypertension']] = (
    strokes_temp_df[['stroke', 'hypertension']].astype('int')
)

# Pearson correlation over the numeric columns only; pandas 2.x raises if
# string or categorical columns are passed to `corr`.
corr = strokes_temp_df.select_dtypes(include="number").corr()

# Colour-graded table rounded to 2 decimals (displayed when run in a
# notebook). `Styler.set_precision` was deprecated in pandas 1.3 and removed
# in 2.0; `Styler.format(precision=...)` is the replacement.
corr.style.background_gradient().format(precision=2)