Description

The dataset Default is taken from the ISLR package.

dim(Default)

## [1] 10000     4

head(Default,4)

The below code constructs a training and testing set using a 70% (train) 30% (test) split.

n.full <- nrow(Default)
n.full

## [1] 10000

n.test <- floor(.3*n.full)
n.test

## [1] 3000

set.seed(0) #use seed 0 
test.index <- sample(1:n.full,n.test)
test.data <- Default[test.index,] 
train.data <- Default[-test.index,] 
n.train <- nrow(train.data)
n.train

## [1] 7000

mean(train.data$student=="Yes" & train.data$default=="Yes")

## [1] 0.01357143

mean(train.data$default=="Yes")

## [1] 0.03157143

mean(test.data$student=="Yes" & test.data$default=="Yes")

## [1] 0.01066667

mean(test.data$default=="Yes")

## [1] 0.03733333

my.data <- train.data[,c("balance","income")]
my.data$default <- ifelse(train.data$default=="Yes",1,0)
plot(my.data$balance,my.data$income,col=factor(my.data$default),cex=.5)

Fit the logistic regression model based on the training data. To train this model, regress default against the three features student, balance, and income. Also compute the training error and testing error for this model.

train.data$default <- ifelse(train.data$default=="Yes",1,0)
test.data$default <- ifelse(test.data$default=="Yes",1,0)
glm.fit <- glm(default ~ student + balance + income, data = train.data, family = binomial)
#glm(default ~ student + balance + income, data = train.data, family = binomial("logit")) #same result as above
summary(glm.fit)

## 
## Call:
## glm(formula = default ~ student + balance + income, family = binomial, 
##     data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1159  -0.1436  -0.0569  -0.0208   3.6714  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.060e+01  5.928e-01 -17.885   <2e-16 ***
## studentYes  -6.026e-01  2.823e-01  -2.135   0.0328 *  
## balance      5.651e-03  2.824e-04  20.012   <2e-16 ***
## income      -2.637e-06  9.922e-06  -0.266   0.7904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1962.3  on 6999  degrees of freedom
## Residual deviance: 1090.7  on 6996  degrees of freedom
## AIC: 1098.7
## 
## Number of Fisher Scoring iterations: 8

#### Method1 use confusion matrix
print("Method1 Use Confusion Matrix")

## [1] "Method1 Use Confusion Matrix"

sigmoid <- function(u) {1/(1+exp(-u))}
y <-  train.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(sigmoid(predict(glm.fit))>.5,"Y","N")
# if without using the sigmoid function then use:  
# predict(glm_fit, type="response")
# it is the same as
# predict(model, type="response")[1:5] ==sigmoid(predict(model))[1:5]
# predict(model, type="response")[1:5] ==exp(predict(model)[1:5]) / (1+ exp(predict(model)[1:5]))
table(y.hat.char,y.char)

##           y.char
## y.hat.char    N    Y
##          N 6754  163
##          Y   25   58

#Training error
mean(y.hat.char!=y.char)

## [1] 0.02685714

# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))

## [1] 0.02685714

###Test error
y <-  test.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(predict(glm.fit, newdata = test.data, type="response")>.5,"Y","N")
table(y.hat.char,y.char)

##           y.char
## y.hat.char    N    Y
##          N 2876   68
##          Y   12   44

#Test error
mean(y.hat.char!=y.char)

## [1] 0.02666667

# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))

## [1] 0.02666667

#### Method2 compute directly
print("Method2 Compute Directly")

## [1] "Method2 Compute Directly"

train.data.new <- train.data %>% add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>%mutate(glm.pred=ifelse(sigmoid.pred>0.5,1,0))
train.data.new <- as_tibble(train.data.new)


#training error
sum(train.data.new$default != train.data.new$glm.pred) / nrow(train.data.new)

## [1] 0.02685714

#test error
test.data.new <- test.data %>% add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>% mutate(glm.pred=ifelse(sigmoid.pred>0.5,1,0))
test.data.new <- as_tibble(test.data.new)
sum(test.data.new$default != test.data.new$glm.pred) / nrow(test.data.new)

## [1] 0.02666667

Write a function named ROC.logistic() that inputs the trained logistic model and an evaluation dataset (train or test). The function should output; i) ROC curve, AUC, and iii) the optimal threshold based on minimum 0-1 loss. Note in class we introduced Youden’s \(J\) statistic but we will use the more common 0-1 loss.

See the class example for a more detained description.

sigmoid <- function(u) {1/(1+exp(-u))}

ROC.logistic <- function(model, data){
  
  y <-  data$default
  y.char <- factor(ifelse(y==1,"Y","N"))
  
  
  y.hat <- predict(model,newdata=data[,c("balance","student","income")], type="response")
  # Define threshold vector 
  prob.vec <- seq(0,1,by=.005)
  R <- length(prob.vec)

  # Define empty vectors for TP and FP rates
  true.pos <- rep(NA,R)
  false.pos <- rep(NA,R)
  
  error <- rep(NA,R)
  error.confusion <- rep(NA,R)

  # Loop
  # This loop computes TP and FP for each threshold
  # We are not required to estmiate multiple models, just the one lda()
  for (i in 1:R) {
    y.hat.char <- factor(ifelse(y.hat>prob.vec[i],"Y","N"),levels=c("N","Y"))
  
    # TP = P(Y.hat = 1 | Y = 1)
    true.pos[i] <- table(y.hat.char,y.char)[2,2]/sum(table(y.hat.char,y.char)[,2])

    # FP = P(Y.hat = 1 | Y = 0)
    false.pos[i] <- table(y.hat.char,y.char)[2,1]/sum(table(y.hat.char,y.char)[,1])
   
    #Test error
    error[i] <- mean(y.hat.char!=y.char)

    # Confusion Matrix extraction
    error.confusion[i] <- (table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
  }
  
  auc <- sum((true.pos[order(false.pos)][-1])*diff(false.pos[order(false.pos)]))
  
  {plot(false.pos,true.pos,type="l",col="blue",main="ROC: Model = Logistic Regression")
  abline(a=0,b=1,lty=3)}
  
  optimal.thres <- prob.vec[which(error == min(error))]
  #optimal.thres2 <- prob.vec[which(error.confusion == min(error.confusion))]
  
  cat("AUC:", auc,"\n")
  cat("The optimal threshold based on minimum 0-1 loss:", optimal.thres)#,"\n",optimal.thres2)
  return(tibble(auc = auc, optimal.thres = optimal.thres))
  
}

Run ROC.logistic() on both the training and testing data.

cat("Train Data:")

## Train Data:

ROC.logistic(model = glm.fit, data = train.data)

## AUC: 0.9515385 
## The optimal threshold based on minimum 0-1 loss: 0.46 0.475 0.48

cat("Test Data:")

## Test Data:

ROC.logistic(model = glm.fit, data = test.data)

## AUC: 0.9596545 
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485

Compute the test error for the \(p=.5\) threshold versus the optimal threshold based on the test data. How do they compare? Should the analyst change the threshold from .5?

ans <- ROC.logistic(model = glm.fit, data = test.data)

## AUC: 0.9596545 
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485

#test error p=0.5
test.data.new <- test.data %>% add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>% mutate(glm.pred_0.5 = ifelse(sigmoid.pred>0.5,1,0),
                                                  glm.pred_1 = ifelse(sigmoid.pred>ans$optimal.thres[1],1,0),
                                                  glm.pred_2 = ifelse(sigmoid.pred>ans$optimal.thres[2],1,0))
test.data.new <- as_tibble(test.data.new)

cat("Threshold p = 0.5, Test Error:", sum(test.data.new$default != test.data.new$glm.pred_0.5) / nrow(test.data.new))

## Threshold p = 0.5, Test Error: 0.02666667

cat("Threshold p =", ans$optimal.thres[1], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_1) / nrow(test.data.new))

## Threshold p = 0.48 , Test Error: 0.02633333

cat("Threshold p =", ans$optimal.thres[2], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_2) / nrow(test.data.new))

## Threshold p = 0.485 , Test Error: 0.02633333

## Based on the test data, the optimal thresholds derived from the 0-1 loss have lower error rate compared to that of setting threshold as 0.5.
## The error rate would reduce from 0.02666667 to 0.02633333 if one uses the optimal thresholds insead of 0.5. But the difference is considered to be pretty small. Therefore, using p = 0.5 as the threshold is still acceptable in this case.

Credit Card Default Prediction

Yi-Pei Chan

Description