The Head of Data at On the Road car insurance has asked for your support as they venture into the world of machine learning! They would like you to start by investigating their customer data and cleaning it in preparation for modeling. Once that is complete, they would like you to tell them which feature produces the best accuracy for predicting whether a customer will make a car insurance claim. Specifically, they have set the following tasks:

Investigate and clean the data so that there are no missing values, and remove the “id” column. Find the feature with the best predictive performance for a car insurance claim (“outcome”) by building simple logistic regression models (each with a single feature) and assessing their accuracy. Store the result in a data frame called best_feature_df, with columns named “best_feature” and “best_accuracy” holding the name of the most accurate feature and its accuracy score.
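For reference, “accuracy” below simply means the proportion of predictions that match the observed outcome. A minimal helper illustrating the idea (the name accuracy_of is hypothetical and not used in the analysis itself):

accuracy_of = function(predicted, actual) {
  # share of 0/1 predictions equal to the observed outcome
  mean(predicted == actual)
}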

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(glue)
# Load the data (base read.csv keeps it as a plain data.frame)
cars = read.csv("data/car_insurance.csv")

# View data types
str(cars)
## 'data.frame':    10000 obs. of  19 variables:
##  $ id                 : int  569520 750365 199901 478866 731664 877557 930134 461006 68366 445911 ...
##  $ age                : int  3 0 0 0 1 2 3 1 2 2 ...
##  $ gender             : int  0 1 0 1 1 0 1 0 0 0 ...
##  $ race               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ driving_experience : int  0 0 0 0 1 2 3 0 2 0 ...
##  $ education          : int  2 0 2 3 0 2 2 3 3 2 ...
##  $ income             : int  3 0 1 1 1 3 3 1 1 3 ...
##  $ credit_score       : num  0.629 0.358 0.493 0.206 0.388 ...
##  $ vehicle_ownership  : num  1 0 1 1 1 1 0 0 0 1 ...
##  $ vehicle_year       : int  1 0 0 0 0 1 1 1 0 0 ...
##  $ married            : num  0 0 0 0 0 0 1 0 1 0 ...
##  $ children           : num  1 0 0 1 0 1 1 1 0 1 ...
##  $ postal_code        : int  10238 10238 10238 32765 32765 10238 10238 10238 10238 32765 ...
##  $ annual_mileage     : num  12000 16000 11000 11000 12000 13000 13000 14000 13000 11000 ...
##  $ vehicle_type       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ speeding_violations: int  0 0 0 0 2 3 7 0 0 0 ...
##  $ duis               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ past_accidents     : int  0 0 0 0 1 3 3 0 0 0 ...
##  $ outcome            : num  0 1 0 0 1 0 0 1 0 1 ...
head(cars)  # preview the first rows

# Missing values per column
colSums(is.na(cars))
##                  id                 age              gender                race 
##                   0                   0                   0                   0 
##  driving_experience           education              income        credit_score 
##                   0                   0                   0                 982 
##   vehicle_ownership        vehicle_year             married            children 
##                   0                   0                   0                   0 
##         postal_code      annual_mileage        vehicle_type speeding_violations 
##                   0                 957                   0                   0 
##                duis      past_accidents             outcome 
##                   0                   0                   0
# Two columns contain NAs; let's look at their distributions
# Distribution of credit_score
summary(cars$credit_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0534  0.4172  0.5250  0.5158  0.6183  0.9608     982
# Distribution of annual_mileage
summary(cars$annual_mileage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2000   10000   12000   11697   14000   22000     957
# Both distributions look roughly normal, so we can use mean imputation
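A quick visual check supports this; a minimal sketch with base R histograms (the number of breaks is an arbitrary choice):

# Optional: eyeball the two distributions before imputing
par(mfrow = c(1, 2))
hist(cars$credit_score, breaks = 30, main = "credit_score", xlab = "credit_score")
hist(cars$annual_mileage, breaks = 20, main = "annual_mileage", xlab = "annual_mileage")
par(mfrow = c(1, 1))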

cars$credit_score[is.na(cars$credit_score)] = mean(cars$credit_score, na.rm = TRUE)
cars$annual_mileage[is.na(cars$annual_mileage)] = mean(cars$annual_mileage, na.rm = TRUE)
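As a quick sanity check (output omitted here), we can confirm that no missing values remain:

# Verify that the imputation removed all NAs
sum(is.na(cars))  # expected to be 0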
# The "id" column carries no predictive information, so drop it as requested
cars = subset(cars, select = -id)

# One row per candidate feature
features_df = data.frame(
  features = names(subset(cars, select = -outcome))
)

# Fit a single-feature logistic regression per candidate and record its accuracy
for (col in features_df$features) {
  model = glm(as.formula(glue("outcome ~ {col}")), data = cars, family = "binomial")
  predictions = round(fitted(model))  # classify at the 0.5 threshold
  accuracy = mean(predictions == cars$outcome)
  features_df[features_df$features == col, "accuracy"] = accuracy
}
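For a closer look at any single model, the same accuracy can be read from a confusion matrix; a minimal sketch using driving_experience (chosen purely for illustration):

# Illustration: confusion matrix for one single-feature model
m = glm(outcome ~ driving_experience, data = cars, family = "binomial")
conf = table(predicted = round(fitted(m)), actual = cars$outcome)
conf
sum(diag(conf)) / sum(conf)  # accuracy = (true positives + true negatives) / total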

features_df
# Find the feature with the highest accuracy and store it as requested

best_row = which.max(features_df$accuracy)
best_feature_df = data.frame(
  best_feature = features_df$features[best_row],
  best_accuracy = features_df$accuracy[best_row]
)
print(best_feature_df)
##         best_feature best_accuracy
## 1 driving_experience        0.7771