Description

The dataset Default is taken from the ISLR package.

library(ISLR)    # Default dataset
library(dplyr)   # %>% and mutate()
library(modelr)  # add_predictions()
library(tibble)  # tibble() and as_tibble()

dim(Default)
## [1] 10000     4
head(Default,4)

The code below constructs training and test sets using a 70% (train) / 30% (test) split.

n.full <- nrow(Default)
n.full
## [1] 10000
n.test <- floor(.3*n.full)
n.test
## [1] 3000
set.seed(0) #use seed 0 
test.index <- sample(1:n.full,n.test)
test.data <- Default[test.index,] 
train.data <- Default[-test.index,] 
n.train <- nrow(train.data)
n.train
## [1] 7000
mean(train.data$student=="Yes" & train.data$default=="Yes")
## [1] 0.01357143
mean(train.data$default=="Yes")
## [1] 0.03157143
mean(test.data$student=="Yes" & test.data$default=="Yes")
## [1] 0.01066667
mean(test.data$default=="Yes")
## [1] 0.03733333
my.data <- train.data[,c("balance","income")]
my.data$default <- ifelse(train.data$default=="Yes",1,0)
plot(my.data$balance,my.data$income,col=factor(my.data$default),cex=.5)
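# Optional legend (run immediately after the plot call above): factor(my.data$default)
# maps 0 to col 1 (black, no default) and 1 to col 2 (red, default)
legend("topright", legend = c("No default", "Default"), col = c(1, 2), pch = 1, cex = .8)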

  1. Fit the logistic regression model based on the training data. To train this model, regress default against the three features student, balance, and income. Also compute the training error and testing error for this model.
train.data$default <- ifelse(train.data$default=="Yes",1,0)
test.data$default <- ifelse(test.data$default=="Yes",1,0)
glm.fit <- glm(default ~ student + balance + income, data = train.data, family = binomial)
#glm(default ~ student + balance + income, data = train.data, family = binomial("logit")) #same result as above
summary(glm.fit)
## 
## Call:
## glm(formula = default ~ student + balance + income, family = binomial, 
##     data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1159  -0.1436  -0.0569  -0.0208   3.6714  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.060e+01  5.928e-01 -17.885   <2e-16 ***
## studentYes  -6.026e-01  2.823e-01  -2.135   0.0328 *  
## balance      5.651e-03  2.824e-04  20.012   <2e-16 ***
## income      -2.637e-06  9.922e-06  -0.266   0.7904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1962.3  on 6999  degrees of freedom
## Residual deviance: 1090.7  on 6996  degrees of freedom
## AIC: 1098.7
## 
## Number of Fisher Scoring iterations: 8
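To see what these coefficients imply, the predicted default probability for a single observation can be computed by hand from the linear predictor. A minimal sketch; the balance of 1500 and income of 40000 are purely illustrative inputs:

b <- coef(glm.fit)  # (Intercept), studentYes, balance, income
# Linear predictor for a hypothetical student with balance = 1500 and income = 40000
eta <- b[1] + b[2]*1 + b[3]*1500 + b[4]*40000
unname(1/(1 + exp(-eta)))  # sigmoid; roughly 0.056 with the rounded estimates printed above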
#### Method 1: use the confusion matrix
print("Method1 Use Confusion Matrix")
## [1] "Method1 Use Confusion Matrix"
sigmoid <- function(u) {1/(1+exp(-u))}
y <-  train.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(sigmoid(predict(glm.fit))>.5,"Y","N")
# Alternatively, skip the explicit sigmoid and use
# predict(glm.fit, type="response"), which returns probabilities directly.
# It is the same because, e.g.:
# predict(glm.fit, type="response")[1:5] == sigmoid(predict(glm.fit))[1:5]
# predict(glm.fit, type="response")[1:5] == exp(predict(glm.fit)[1:5]) / (1 + exp(predict(glm.fit)[1:5]))
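# Sanity check of the equivalence claimed above (uses glm.fit and sigmoid from this section)
all.equal(predict(glm.fit, type = "response"), sigmoid(predict(glm.fit)))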
table(y.hat.char,y.char)
##           y.char
## y.hat.char    N    Y
##          N 6754  163
##          Y   25   58
#Training error
mean(y.hat.char!=y.char)
## [1] 0.02685714
# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
## [1] 0.02685714
### Test error
y <-  test.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(predict(glm.fit, newdata = test.data, type="response")>.5,"Y","N")
table(y.hat.char,y.char)
##           y.char
## y.hat.char    N    Y
##          N 2876   68
##          Y   12   44
#Test error
mean(y.hat.char!=y.char)
## [1] 0.02666667
# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
## [1] 0.02666667
#### Method 2: compute directly
print("Method2 Compute Directly")
## [1] "Method2 Compute Directly"
# add_predictions() stores link-scale predictions in "pred"; sigmoid() maps them to probabilities
train.data.new <- train.data %>%
  add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>%
  mutate(glm.pred = ifelse(sigmoid.pred > 0.5, 1, 0))
train.data.new <- as_tibble(train.data.new)


#training error
sum(train.data.new$default != train.data.new$glm.pred) / nrow(train.data.new)
## [1] 0.02685714
#test error
test.data.new <- test.data %>%
  add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>%
  mutate(glm.pred = ifelse(sigmoid.pred > 0.5, 1, 0))
test.data.new <- as_tibble(test.data.new)
sum(test.data.new$default != test.data.new$glm.pred) / nrow(test.data.new)
## [1] 0.02666667
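Since the same 0-1 loss computation recurs throughout, it can be factored into a small helper. A minimal sketch; the name err.rate is ours, not part of the assignment:

# Hypothetical helper: misclassification rate of a fitted logistic model on a
# data set whose default column is coded 0/1, at a given probability threshold
err.rate <- function(model, data, thres = 0.5) {
  p.hat <- predict(model, newdata = data, type = "response")
  mean(as.numeric(p.hat > thres) != data$default)
}
err.rate(glm.fit, train.data)  # should reproduce 0.02685714
err.rate(glm.fit, test.data)   # should reproduce 0.02666667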
  2. Write a function named ROC.logistic() that inputs the trained logistic model and an evaluation dataset (train or test). The function should output: i) the ROC curve, ii) the AUC, and iii) the optimal threshold based on minimum 0-1 loss. Note that in class we introduced Youden's \(J\) statistic, but here we use the more common 0-1 loss.

See the class example for a more detailed description.
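Concretely, for a candidate threshold \(t\), the 0-1 loss is the misclassification rate \(\frac{1}{n}\sum_{i=1}^{n} 1\{\hat{y}_i(t) \neq y_i\}\) with \(\hat{y}_i(t) = 1\{\hat{p}_i > t\}\); the optimal threshold is the \(t\) on the grid that minimizes this quantity, which is exactly what the loop inside the function computes.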

sigmoid <- function(u) {1/(1+exp(-u))}

ROC.logistic <- function(model, data){
  
  y <-  data$default
  y.char <- factor(ifelse(y==1,"Y","N"))
  
  
  y.hat <- predict(model,newdata=data[,c("balance","student","income")], type="response")
  # Define threshold vector 
  prob.vec <- seq(0,1,by=.005)
  R <- length(prob.vec)

  # Define empty vectors for TP and FP rates
  true.pos <- rep(NA,R)
  false.pos <- rep(NA,R)
  
  error <- rep(NA,R)
  error.confusion <- rep(NA,R)

  # Loop over thresholds
  # This loop computes the TP and FP rates (and the 0-1 error) for each threshold
  # We are not required to estimate multiple models, just the one fitted glm
  for (i in 1:R) {
    y.hat.char <- factor(ifelse(y.hat>prob.vec[i],"Y","N"),levels=c("N","Y"))
  
    # TP = P(Y.hat = 1 | Y = 1)
    true.pos[i] <- table(y.hat.char,y.char)[2,2]/sum(table(y.hat.char,y.char)[,2])

    # FP = P(Y.hat = 1 | Y = 0)
    false.pos[i] <- table(y.hat.char,y.char)[2,1]/sum(table(y.hat.char,y.char)[,1])
   
    #Test error
    error[i] <- mean(y.hat.char!=y.char)

    # Confusion Matrix extraction
    error.confusion[i] <- (table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
  }
  
  # Rectangle-rule approximation of the area under the ROC curve
  auc <- sum((true.pos[order(false.pos)][-1])*diff(false.pos[order(false.pos)]))
  
  {plot(false.pos,true.pos,type="l",col="blue",main="ROC: Model = Logistic Regression")
  abline(a=0,b=1,lty=3)}
  
  optimal.thres <- prob.vec[which(error == min(error))]
  #optimal.thres2 <- prob.vec[which(error.confusion == min(error.confusion))]
  
  cat("AUC:", auc,"\n")
  cat("The optimal threshold based on minimum 0-1 loss:", optimal.thres)#,"\n",optimal.thres2)
  return(tibble(auc = auc, optimal.thres = optimal.thres))
  
}
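As a sanity check on the rectangle-rule AUC used inside ROC.logistic(), the same (false positive, true positive) points can be integrated with the trapezoidal rule; with 201 thresholds the two rules should agree to several decimal places. A minimal sketch, assuming the FP/TP vectors are exposed (e.g. by also returning them from the function):

# Trapezoidal-rule AUC over the (false.pos, true.pos) points of an ROC sweep
auc.trapezoid <- function(false.pos, true.pos) {
  o <- order(false.pos)
  fp <- false.pos[o]
  tp <- true.pos[o]
  sum(diff(fp) * (head(tp, -1) + tail(tp, -1)) / 2)
}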
  3. Run ROC.logistic() on both the training and testing data.
cat("Train Data:")
## Train Data:
ROC.logistic(model = glm.fit, data = train.data)

## AUC: 0.9515385 
## The optimal threshold based on minimum 0-1 loss: 0.46 0.475 0.48
cat("Test Data:")
## Test Data:
ROC.logistic(model = glm.fit, data = test.data)

## AUC: 0.9596545 
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485
  4. Compute the test error for the \(p=.5\) threshold versus the optimal threshold based on the test data. How do they compare? Should the analyst change the threshold from .5?
ans <- ROC.logistic(model = glm.fit, data = test.data)

## AUC: 0.9596545 
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485
# Test error at p = 0.5 versus the two optimal thresholds found above
test.data.new <- test.data %>%
  add_predictions(model = glm.fit, var = "pred") %>%
  mutate(sigmoid.pred = sigmoid(pred)) %>%
  mutate(glm.pred_0.5 = ifelse(sigmoid.pred > 0.5, 1, 0),
         glm.pred_1 = ifelse(sigmoid.pred > ans$optimal.thres[1], 1, 0),
         glm.pred_2 = ifelse(sigmoid.pred > ans$optimal.thres[2], 1, 0))
test.data.new <- as_tibble(test.data.new)

cat("Threshold p = 0.5, Test Error:", sum(test.data.new$default != test.data.new$glm.pred_0.5) / nrow(test.data.new))
## Threshold p = 0.5, Test Error: 0.02666667
cat("Threshold p =", ans$optimal.thres[1], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_1) / nrow(test.data.new))
## Threshold p = 0.48 , Test Error: 0.02633333
cat("Threshold p =", ans$optimal.thres[2], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_2) / nrow(test.data.new))
## Threshold p = 0.485 , Test Error: 0.02633333
## Based on the test data, the optimal thresholds derived from the 0-1 loss give a lower error rate than the default threshold of 0.5.
## The test error drops from 0.02666667 to 0.02633333 (one fewer misclassification out of 3000 test observations) when an optimal threshold is used instead of 0.5. The improvement is very small, so keeping p = 0.5 as the threshold is still acceptable in this case.