import pandas as pd

# Load the insurance CSV into a DataFrame and preview its first five rows
insurance_data = pd.read_csv("insurance.csv")
print(insurance_data.head(n=5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

import pandas as pd

# Repeat of the load-and-preview step earlier in this file:
# re-read the same CSV and show the top five rows again
insurance_data = pd.read_csv(filepath_or_buffer="insurance.csv")
print(insurance_data.head(5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

# Verifying there are no empty/null values: count the nulls in each column.
# Printed explicitly (like the other cells) so the counts are also shown when
# this runs as a plain script, where a bare expression's value is discarded.
print(insurance_data.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Summary statistics from DataFrame.describe(), rounded to two decimals
numeric_summary = insurance_data.describe()
print(numeric_summary.round(2))

           age      bmi  children   charges
count  1338.00  1338.00   1338.00   1338.00
mean     39.21    30.66      1.09  13270.42
std      14.05     6.10      1.21  12110.01
min      18.00    15.96      0.00   1121.87
25%      27.00    26.30      0.00   4740.29
50%      39.00    30.40      1.00   9382.03
75%      51.00    34.69      2.00  16639.91
max      64.00    53.13      5.00  63770.43

# Mean charges (two decimals) within each level of sex, smoker and region
for category in ("sex", "smoker", "region"):
    mean_charges = insurance_data.groupby(category)["charges"].mean()
    print(mean_charges.round(2))

sex
female    12569.58
male      13956.75
Name: charges, dtype: float64
smoker
no      8434.27
yes    32050.23
Name: charges, dtype: float64
region
northeast    13406.38
northwest    12417.58
southeast    14735.41
southwest    12346.94
Name: charges, dtype: float64

# Mean BMI (two decimals) within each level of sex, smoker and region
for category in ("sex", "smoker", "region"):
    mean_bmi = insurance_data.groupby(category)["bmi"].mean()
    print(mean_bmi.round(2))

sex
female    30.38
male      30.94
Name: bmi, dtype: float64
smoker
no     30.65
yes    30.71
Name: bmi, dtype: float64
region
northeast    29.17
northwest    29.20
southeast    33.36
southwest    30.60
Name: bmi, dtype: float64

# Visualization: plotting libraries, then an annotated heatmap of the
# pairwise correlations between the numeric columns
import seaborn as sns
import matplotlib.pyplot as plt

correlations = insurance_data.corr(numeric_only=True)

# Draw on an explicit 8x6 Axes instead of relying on the implicit current figure
_, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlations,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation heatmap")
plt.show()

# Scatterplots with a regression line for age and BMI, each plotted against charges
from pathlib import Path

# savefig does not create missing directories, so make sure the output folder exists
Path("images").mkdir(parents=True, exist_ok=True)

# Age vs charges. Saved under a name that matches the plot; this figure was
# previously written to "images/bmi_vs_charges.png", mislabelling its content.
sns.lmplot(x="age", y="charges", data=insurance_data)
plt.title("Scatterplot of Age vs Charges")
plt.savefig("images/age_vs_charges.png", dpi=300, bbox_inches="tight")
plt.show()

# BMI vs charges. Saved as "images/bmi_vs_charges.png" so that file now holds
# the BMI plot its name describes.
sns.lmplot(x="bmi", y="charges", data=insurance_data)
plt.title("Scatterplot of bmi vs Charges")
plt.savefig("images/bmi_vs_charges.png", dpi=300, bbox_inches="tight")
plt.show()

U.S. Medical Insurance Costs

1. Data Loading

2. Data Exploration

3. Data Cleaning

4. Data Analysis

5. Conclusions