# Car_Insurance.py — from the azhar-fullstack/python-data-analysis repository (main branch)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------
# Part 1: Load the Dataset
# -------------------------
# The path is relative, so the CSV must sit in the working directory the
# script is launched from. `df` is the frame every later section works on.
DATA_FILE = 'Car_Insurance_Claim.csv'
df = pd.read_csv(DATA_FILE)

# (rows, columns) of the freshly loaded frame.
loaded_shape = df.shape
print("Data Shape:", loaded_shape)

# -----------------------------
# Part 2: Basic Data Exploration
# -----------------------------
# Column names, dtypes and non-null counts, printed by pandas itself.
df.info()

# Summary statistics; include='all' asks pandas to describe the
# non-numeric columns alongside the numeric ones.
print("\nDescriptive Statistics:")
descriptive_stats = df.describe(include='all')
print(descriptive_stats)

# -------------------------------
# Part 3: Data Cleaning & Prep
# -------------------------------
# 3.1 Count fully duplicated rows (reported only; nothing is dropped here)
duplicate_rows = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_rows)

# 3.2 Count missing values per column
missing_values = df.isnull().sum()
print("\nMissing values:\n", missing_values)

# Label-like columns become pandas categoricals, all in a single cast.
categorical_cols = ['AGE', 'GENDER', 'RACE', 'DRIVING_EXPERIENCE',
                    'EDUCATION', 'INCOME', 'VEHICLE_TYPE']
df = df.astype({name: 'category' for name in categorical_cols})

# Binary flags are stored as plain integers.
# NOTE(review): astype(int) raises if any of these columns contains NaN —
# confirm the source data has no gaps in them.
binary_flag_cols = ('VEHICLE_OWNERSHIP', 'MARRIED', 'CHILDREN')
df = df.astype(dict.fromkeys(binary_flag_cols, int))

# Force the measurement columns (and the target) to numeric dtype; any value
# that cannot be parsed becomes NaN because of errors='coerce'.
numeric_cols = ['CREDIT_SCORE', 'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS',
                'DUIS', 'PAST_ACCIDENTS', 'OUTCOME']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols}
)

# 3.3 Fill the gaps in these two columns with their respective column mean.
for name in ('CREDIT_SCORE', 'ANNUAL_MILEAGE'):
    column_mean = df[name].mean()
    df[name] = df[name].fillna(column_mean)

# Re-print the dtype / non-null overview to confirm the cleaning result.
df.info()

# -------------------------------------
# Part 4: Exploratory Data Analysis (EDA)
# -------------------------------------
# 4.1 Distribution of Key Numerical Variables
# One histogram per numeric feature, drawn on a shared figure.
numeric_features = ['CREDIT_SCORE','ANNUAL_MILEAGE','SPEEDING_VIOLATIONS','DUIS','PAST_ACCIDENTS']
df[numeric_features].hist(figsize=(12, 8), bins=15, edgecolor='black')
plt.tight_layout()
plt.show()

# 4.2 Distribution of AGE Categories
# Bars are ordered by frequency (value_counts sorts descending), not by age.
plt.figure(figsize=(8,5))
sns.countplot(x='AGE', data=df, order=df['AGE'].value_counts().index)
plt.title('Distribution of AGE Categories')
plt.show()

# 4.3 Relationship Between AGE and OUTCOME
# AGE is categorical at this point. Grouping on a categorical without an
# explicit `observed=` emits a FutureWarning on recent pandas and the default
# is changing, so pin observed=False to keep the current result (one row per
# category) stable across pandas versions.
age_outcome = df.groupby('AGE', observed=False)['OUTCOME'].mean().reset_index()
plt.figure(figsize=(8,5))
sns.barplot(x='AGE', y='OUTCOME', data=age_outcome, order=age_outcome['AGE'])
plt.title('Mean OUTCOME by AGE Category')
plt.xlabel('AGE Category')
plt.ylabel('Mean OUTCOME (Purchase/Claim Rate)')
plt.show()

# -----------------------------------------
# Part 5: Correlation and Pivot Table
# -----------------------------------------
# Pairwise correlation of the numeric columns (OUTCOME included, since it is
# part of numeric_cols), rendered as an annotated heatmap.
plt.figure(figsize=(10,6))
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()

# Mean OUTCOME for every AGE x GENDER combination. Both keys are categorical,
# and the default of `observed` for pivot_table is deprecated/changing in
# recent pandas; pin observed=False so the full category grid is kept and
# combinations with no rows still appear (as 0 after fillna).
pivot_table_example = df.pivot_table(
    values='OUTCOME',
    index='AGE',
    columns='GENDER',
    aggfunc='mean',
    observed=False,
).fillna(0)
print("\nPivot Table - Mean OUTCOME by AGE & GENDER:\n", pivot_table_example)

# -----------------------------------------------
# Part 6: Basic Predictive Modeling (Logistic Reg)
# -----------------------------------------------
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
columns_to_encode = ['AGE','GENDER','RACE','DRIVING_EXPERIENCE','EDUCATION','INCOME','VEHICLE_TYPE']
df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

# Target is OUTCOME; the listed columns are excluded from the features.
# errors='ignore' tolerates any of them being absent from the frame.
non_feature_cols = ['ID','POSTAL_CODE','OUTCOME','VEHICLE_YEAR']
y = df_encoded['OUTCOME']
X = df_encoded.drop(columns=non_feature_cols, errors='ignore')

# 80/20 hold-out split with a fixed seed so the split is reproducible.
# NOTE(review): the split is not stratified and features are not scaled —
# confirm that is intended before relying on the reported metrics.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# fit() returns the estimator itself, so construction and training chain.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Per-class precision/recall/F1, followed by the raw confusion matrix.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion)

# -----------------------------------------
# Part 7: Conclusions & Recommendations
# -----------------------------------------
# These statements are fixed text: they are not computed from the results
# above, so re-check them whenever the input data changes.
key_observations = (
    "- Younger drivers (16-25) show a higher average OUTCOME rate.",
    "- Credit scores, speeding violations, DUIs, and past accidents all influence OUTCOME.",
    "- The pivot table indicates that male drivers in younger age brackets have higher OUTCOME rates.",
)
recommendations = (
    "- Consider tailoring marketing or pricing strategies for younger age groups.",
    "- Further data on online interactions (e.g., time spent, quote frequency) would deepen the analysis.",
)

print("\nKey Observations:")
for observation in key_observations:
    print(observation)

print("\nRecommendations:")
for recommendation in recommendations:
    print(recommendation)