The Head of Data at On the Road car insurance has asked for your support as they venture into the world of machine learning! They would like you to start by investigating their customer data and cleaning it in preparation for modeling. Once that is complete, they would like to know which single feature produces the best accuracy for predicting whether a customer will make a car insurance claim. Specifically, they have set the following tasks:

- Investigate and clean the data so that there are no missing values, and remove the "id" column.
- Find the feature with the best predictive performance for a car insurance claim ("outcome") by creating simple logistic regression models (each with a single feature) and assessing their accuracy.
- Create a data frame called best_feature_df, containing columns named "best_feature" and "best_accuracy", holding the name of the feature with the highest accuracy and the corresponding accuracy score.
library(glue) # used below to build one model formula per feature
# Read in the customer data
cars = read.csv("data/car_insurance.csv")
# View data types
str(cars)
## 'data.frame': 10000 obs. of 19 variables:
## $ id : int 569520 750365 199901 478866 731664 877557 930134 461006 68366 445911 ...
## $ age : int 3 0 0 0 1 2 3 1 2 2 ...
## $ gender : int 0 1 0 1 1 0 1 0 0 0 ...
## $ race : int 1 1 1 1 1 1 1 1 1 1 ...
## $ driving_experience : int 0 0 0 0 1 2 3 0 2 0 ...
## $ education : int 2 0 2 3 0 2 2 3 3 2 ...
## $ income : int 3 0 1 1 1 3 3 1 1 3 ...
## $ credit_score : num 0.629 0.358 0.493 0.206 0.388 ...
## $ vehicle_ownership : num 1 0 1 1 1 1 0 0 0 1 ...
## $ vehicle_year : int 1 0 0 0 0 1 1 1 0 0 ...
## $ married : num 0 0 0 0 0 0 1 0 1 0 ...
## $ children : num 1 0 0 1 0 1 1 1 0 1 ...
## $ postal_code : int 10238 10238 10238 32765 32765 10238 10238 10238 10238 32765 ...
## $ annual_mileage : num 12000 16000 11000 11000 12000 13000 13000 14000 13000 11000 ...
## $ vehicle_type : int 0 0 0 0 0 0 0 0 0 0 ...
## $ speeding_violations: int 0 0 0 0 2 3 7 0 0 0 ...
## $ duis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ past_accidents : int 0 0 0 0 1 3 3 0 0 0 ...
## $ outcome : num 0 1 0 0 1 0 0 1 0 1 ...
head(cars)
# Missing values per column
colSums(is.na(cars))
## id age gender race
## 0 0 0 0
## driving_experience education income credit_score
## 0 0 0 982
## vehicle_ownership vehicle_year married children
## 0 0 0 0
## postal_code annual_mileage vehicle_type speeding_violations
## 0 957 0 0
## duis past_accidents outcome
## 0 0 0
# Two columns have NAs (credit_score: 982, annual_mileage: 957);
# let's examine their distributions
# Distribution of credit_score
summary(cars$credit_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0534 0.4172 0.5250 0.5158 0.6183 0.9608 982
# Distribution of annual_mileage
summary(cars$annual_mileage)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2000 10000 12000 11697 14000 22000 957
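Before imputing, a quick histogram check (illustrative; this plot was not part of the original output) helps confirm that both distributions are roughly symmetric:

# Illustrative check: histograms of the two columns with missing values
par(mfrow = c(1, 2))
hist(cars$credit_score, main = "credit_score", xlab = "credit_score")
hist(cars$annual_mileage, main = "annual_mileage", xlab = "annual_mileage")
par(mfrow = c(1, 1))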
# Both distributions look roughly normal, so mean imputation is a reasonable choice
cars$credit_score[is.na(cars$credit_score)] = mean(cars$credit_score, na.rm = TRUE)
cars$annual_mileage[is.na(cars$annual_mileage)] = mean(cars$annual_mileage, na.rm = TRUE)
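As a quick sanity check (not in the original script), verify that the imputation left no missing values anywhere in the data:

# Confirm that no NAs remain after imputation
stopifnot(!anyNA(cars))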
# Remove the id column (no predictive value), per the brief
cars = subset(cars, select = -id)
# Candidate features: every column except the outcome
features_df = data.frame(
  features = names(subset(cars, select = -outcome))
)
# Fit a single-feature logistic regression per variable and record its accuracy
for (col in features_df$features) {
  model = glm(glue("outcome ~ {col}"), data = cars, family = "binomial")
  # Convert fitted probabilities to class predictions at a 0.5 threshold
  predictions = round(fitted(model))
  accuracy = mean(predictions == cars$outcome)
  features_df[features_df$features == col, "accuracy"] = accuracy
}
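To make the loop concrete, here is what a single iteration computes for one feature (an illustrative sketch; note that accuracy is evaluated on the same data the model was fitted to, i.e. in-sample):

# Illustrative: one single-feature model and its in-sample performance
m = glm(outcome ~ driving_experience, data = cars, family = "binomial")
table(predicted = round(fitted(m)), actual = cars$outcome) # confusion matrix
mean(round(fitted(m)) == cars$outcome)                     # in-sample accuracy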
features_df
# Find the feature with the highest accuracy
best_feature = features_df[which.max(features_df$accuracy),]
print(best_feature)
## features accuracy
## 4 driving_experience 0.7771
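Finally, assemble the best_feature_df data frame requested in the brief (a minimal completion using the best-performing row found above):

# Deliverable: data frame with the best feature and its accuracy
best_feature_df = data.frame(
  best_feature = best_feature$features,
  best_accuracy = best_feature$accuracy
)
best_feature_df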