The dataset Default is taken from the ISLR package.
dim(Default)
## [1] 10000 4
head(Default,4)
The below code constructs a training and testing set using a 70% (train) 30% (test) split.
n.full <- nrow(Default)
n.full
## [1] 10000
n.test <- floor(.3*n.full)
n.test
## [1] 3000
set.seed(0) #use seed 0
test.index <- sample(1:n.full,n.test)
test.data <- Default[test.index,]
train.data <- Default[-test.index,]
n.train <- nrow(train.data)
n.train
## [1] 7000
mean(train.data$student=="Yes" & train.data$default=="Yes")
## [1] 0.01357143
mean(train.data$default=="Yes")
## [1] 0.03157143
mean(test.data$student=="Yes" & test.data$default=="Yes")
## [1] 0.01066667
mean(test.data$default=="Yes")
## [1] 0.03733333
my.data <- train.data[,c("balance","income")]
my.data$default <- ifelse(train.data$default=="Yes",1,0)
plot(my.data$balance,my.data$income,col=factor(my.data$default),cex=.5)
train.data$default <- ifelse(train.data$default=="Yes",1,0)
test.data$default <- ifelse(test.data$default=="Yes",1,0)
glm.fit <- glm(default ~ student + balance + income, data = train.data, family = binomial)
#glm(default ~ student + balance + income, data = train.data, family = binomial("logit")) #same result as above
summary(glm.fit)
##
## Call:
## glm(formula = default ~ student + balance + income, family = binomial,
## data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1159 -0.1436 -0.0569 -0.0208 3.6714
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.060e+01 5.928e-01 -17.885 <2e-16 ***
## studentYes -6.026e-01 2.823e-01 -2.135 0.0328 *
## balance 5.651e-03 2.824e-04 20.012 <2e-16 ***
## income -2.637e-06 9.922e-06 -0.266 0.7904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1962.3 on 6999 degrees of freedom
## Residual deviance: 1090.7 on 6996 degrees of freedom
## AIC: 1098.7
##
## Number of Fisher Scoring iterations: 8
#### Method1 use confusion matrix
print("Method1 Use Confusion Matrix")
## [1] "Method1 Use Confusion Matrix"
sigmoid <- function(u) {1/(1+exp(-u))}
y <- train.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(sigmoid(predict(glm.fit))>.5,"Y","N")
# if without using the sigmoid function then use:
# predict(glm_fit, type="response")
# it is the same as
# predict(model, type="response")[1:5] ==sigmoid(predict(model))[1:5]
# predict(model, type="response")[1:5] ==exp(predict(model)[1:5]) / (1+ exp(predict(model)[1:5]))
table(y.hat.char,y.char)
## y.char
## y.hat.char N Y
## N 6754 163
## Y 25 58
#Training error
mean(y.hat.char!=y.char)
## [1] 0.02685714
# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
## [1] 0.02685714
###Test error
y <- test.data$default #; y
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat.char <- ifelse(predict(glm.fit, newdata = test.data, type="response")>.5,"Y","N")
table(y.hat.char,y.char)
## y.char
## y.hat.char N Y
## N 2876 68
## Y 12 44
#Test error
mean(y.hat.char!=y.char)
## [1] 0.02666667
# Confusion Matrix extraction
(table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
## [1] 0.02666667
#### Method2 compute directly
print("Method2 Compute Directly")
## [1] "Method2 Compute Directly"
train.data.new <- train.data %>% add_predictions(model = glm.fit, var = "pred") %>%
mutate(sigmoid.pred = sigmoid(pred)) %>%mutate(glm.pred=ifelse(sigmoid.pred>0.5,1,0))
train.data.new <- as_tibble(train.data.new)
#training error
sum(train.data.new$default != train.data.new$glm.pred) / nrow(train.data.new)
## [1] 0.02685714
#test error
test.data.new <- test.data %>% add_predictions(model = glm.fit, var = "pred") %>%
mutate(sigmoid.pred = sigmoid(pred)) %>% mutate(glm.pred=ifelse(sigmoid.pred>0.5,1,0))
test.data.new <- as_tibble(test.data.new)
sum(test.data.new$default != test.data.new$glm.pred) / nrow(test.data.new)
## [1] 0.02666667
See the class example for a more detained description.
sigmoid <- function(u) {1/(1+exp(-u))}
ROC.logistic <- function(model, data){
y <- data$default
y.char <- factor(ifelse(y==1,"Y","N"))
y.hat <- predict(model,newdata=data[,c("balance","student","income")], type="response")
# Define threshold vector
prob.vec <- seq(0,1,by=.005)
R <- length(prob.vec)
# Define empty vectors for TP and FP rates
true.pos <- rep(NA,R)
false.pos <- rep(NA,R)
error <- rep(NA,R)
error.confusion <- rep(NA,R)
# Loop
# This loop computes TP and FP for each threshold
# We are not required to estmiate multiple models, just the one lda()
for (i in 1:R) {
y.hat.char <- factor(ifelse(y.hat>prob.vec[i],"Y","N"),levels=c("N","Y"))
# TP = P(Y.hat = 1 | Y = 1)
true.pos[i] <- table(y.hat.char,y.char)[2,2]/sum(table(y.hat.char,y.char)[,2])
# FP = P(Y.hat = 1 | Y = 0)
false.pos[i] <- table(y.hat.char,y.char)[2,1]/sum(table(y.hat.char,y.char)[,1])
#Test error
error[i] <- mean(y.hat.char!=y.char)
# Confusion Matrix extraction
error.confusion[i] <- (table(y.hat.char,y.char)[1,2]+table(y.hat.char,y.char)[2,1])/sum(table(y.hat.char,y.char))
}
auc <- sum((true.pos[order(false.pos)][-1])*diff(false.pos[order(false.pos)]))
{plot(false.pos,true.pos,type="l",col="blue",main="ROC: Model = Logistic Regression")
abline(a=0,b=1,lty=3)}
optimal.thres <- prob.vec[which(error == min(error))]
#optimal.thres2 <- prob.vec[which(error.confusion == min(error.confusion))]
cat("AUC:", auc,"\n")
cat("The optimal threshold based on minimum 0-1 loss:", optimal.thres)#,"\n",optimal.thres2)
return(tibble(auc = auc, optimal.thres = optimal.thres))
}
cat("Train Data:")
## Train Data:
ROC.logistic(model = glm.fit, data = train.data)
## AUC: 0.9515385
## The optimal threshold based on minimum 0-1 loss: 0.46 0.475 0.48
cat("Test Data:")
## Test Data:
ROC.logistic(model = glm.fit, data = test.data)
## AUC: 0.9596545
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485
ans <- ROC.logistic(model = glm.fit, data = test.data)
## AUC: 0.9596545
## The optimal threshold based on minimum 0-1 loss: 0.48 0.485
#test error p=0.5
test.data.new <- test.data %>% add_predictions(model = glm.fit, var = "pred") %>%
mutate(sigmoid.pred = sigmoid(pred)) %>% mutate(glm.pred_0.5 = ifelse(sigmoid.pred>0.5,1,0),
glm.pred_1 = ifelse(sigmoid.pred>ans$optimal.thres[1],1,0),
glm.pred_2 = ifelse(sigmoid.pred>ans$optimal.thres[2],1,0))
test.data.new <- as_tibble(test.data.new)
cat("Threshold p = 0.5, Test Error:", sum(test.data.new$default != test.data.new$glm.pred_0.5) / nrow(test.data.new))
## Threshold p = 0.5, Test Error: 0.02666667
cat("Threshold p =", ans$optimal.thres[1], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_1) / nrow(test.data.new))
## Threshold p = 0.48 , Test Error: 0.02633333
cat("Threshold p =", ans$optimal.thres[2], ", Test Error:", sum(test.data.new$default != test.data.new$glm.pred_2) / nrow(test.data.new))
## Threshold p = 0.485 , Test Error: 0.02633333
## Based on the test data, the optimal thresholds derived from the 0-1 loss have lower error rate compared to that of setting threshold as 0.5.
## The error rate would reduce from 0.02666667 to 0.02633333 if one uses the optimal thresholds insead of 0.5. But the difference is considered to be pretty small. Therefore, using p = 0.5 as the threshold is still acceptable in this case.