Logistic Regression
Soumya Roy
# Load the "ISLR" package, which provides the "Default" data set
library(ISLR)
# Attach "Default" so its columns (default, student, balance, income)
# can be referenced by bare name below
# NOTE(review): attach() is generally discouraged (it can mask other
# objects); kept because the rest of the script relies on it
attach(Default)
# Names of the variables in the "Default" data set
names(Default)
## [1] "default" "student" "balance" "income"
# Dimension of the "Default" data set: 10000 observations, 4 variables
dim(Default)
## [1] 10000 4
# Descriptive summary of each variable
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
## Boxplots: distribution of balance and income by default status
# (rejoined lines broken by text extraction)
boxplot(balance ~ default, col = c("red", "blue"), xlab = "Default",
        ylab = "Balance", main = "Balance vs Default")
# Fixed: the income plot previously carried ylab = "Balance"
boxplot(income ~ default, col = c("red", "blue"), xlab = "Default",
        ylab = "Income", main = "Income vs Default")
## Barplot: default rate by student status
# "tab" rather than "T" -- T is the (reassignable) alias for TRUE
tab <- table(default, student)
tab
## student
## default No Yes
## No 6850 2817
## Yes 206 127
# Column-wise proportions: P(default status | student status)
prop <- prop.table(tab, margin = 2)
prop
## student
## default No Yes
## No 0.97080499 0.95686141
## Yes 0.02919501 0.04313859
# Second row of the proportion table gives the default rate
barplot(prop[2, ], col = c("red", "blue"), xlab = "Student",
        ylab = "Default Rate")
# Fitting a logistic regression model using the predictor "balance"
# The function "glm()" fits generalized linear models, a class of models
# that includes logistic regression as a special case
# The syntax of "glm()" is similar to that of "lm()", except that we have
# to pass the argument "family=binomial" in order to fit a logistic
# regression model
# Simple logistic regression of default status on balance
mod_1 <- glm(default ~ balance, family = binomial, data = Default)
summary(mod_1)
##
## Call:
## glm(formula = default ~ balance, family = binomial, data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2697 -0.1465 -0.0589 -0.0221 3.7589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
# Logistic regression of default status on the single predictor "student"
mod_2 <- glm(default ~ student, family = binomial, data = Default)
summary(mod_2)
##
## Call:
## glm(formula = default ~ student, family = binomial, data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2970 -0.2970 -0.2434 -0.2434 2.6585
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.50413 0.07071 -49.55 < 2e-16 ***
## studentYes 0.40489 0.11502 3.52 0.000431 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 2908.7 on 9998 degrees of freedom
## AIC: 2912.7
##
## Number of Fisher Scoring iterations: 6
# Fitting a logistic regression model using the predictors "balance",
# "student", and "income"
# Multiple logistic regression: all three predictors
mod_3 <- glm(default ~ balance + student + income, family = binomial,
             data = Default)
summary(mod_3)
##
## Call:
## glm(formula = default ~ balance + student + income, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4691 -0.1418 -0.0557 -0.0203 3.7383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## income 3.033e-06 8.203e-06 0.370 0.71152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
# Odds ratios and their 95% profile-likelihood CIs
# library() rather than require() so a missing package fails loudly;
# MASS supplies the profile method used by confint() on glm fits
library(MASS)
## Loading required package: MASS
exp(cbind(coef(mod_3), confint(mod_3)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.903854e-05 7.074481e-06 0.0000487808
## balance 1.005753e+00 1.005309e+00 1.0062238757
## studentYes 5.237317e-01 3.298827e-01 0.8334223982
## income 1.000003e+00 9.999870e-01 1.0000191246
# Hosmer-Lemeshow goodness-of-fit test for the fitted model
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
# Recode the response as 0/1 (1 = "Yes") as required by hoslem.test()
default_new <- ifelse(default == "Yes", 1, 0)
hoslem.test(default_new, fitted(mod_3))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: default_new, fitted(mod_3)
## X-squared = 3.6823, df = 8, p-value = 0.8846
# Using the "predict()" function to obtain probabilities of the form
# "P(Y=1|X)"
# The "type=response" option ensures output of the form "P(Y=1|X)",
# rather than other information such as the logit
mod_3.probs <- predict(mod_3, type = "response")
# Show the first ten fitted probabilities
head(mod_3.probs, 10)
## 1 2 3 4 5 6
## 1.428724e-03 1.122204e-03 9.812272e-03 4.415893e-04 1.935506e-03 1.989518e-03
## 7 8 9 10
## 2.333767e-03 1.086718e-03 1.638333e-02 2.080617e-05
# Using the "contrasts()" function to check the dummy variable created by R
# ("Yes" is coded 1, so the model estimates P(default = "Yes"))
contrasts(default)
## Yes
## No 0
## Yes 1
# Conversion of probabilities into class labels with a 0.5 cut-off
# (length of the probability vector instead of the hard-coded 10000,
# so the code survives a change in sample size)
mod_3.pred <- rep("No", length(mod_3.probs))
mod_3.pred[mod_3.probs > .5] <- "Yes"
# Creating a confusion matrix to check how many observations are correctly
# or incorrectly classified
# Confusion matrix: predicted class (rows) vs actual default status (columns)
table(mod_3.pred,default)
## default
## mod_3.pred No Yes
## No 9627 228
## Yes 40 105
# Fraction of observations for which the prediction was correct
mean(mod_3.pred==default)
## [1] 0.9732
# Misclassification rate (1 - accuracy)
mean(mod_3.pred!=default)
## [1] 0.0268
# Changing the cut-off from 0.5 to 0.2
# Conversion of probabilities into class labels
# (length of the probability vector instead of the hard-coded 10000)
mod_3.pred <- rep("No", length(mod_3.probs))
mod_3.pred[mod_3.probs > .2] <- "Yes"
# Creating a confusion matrix to check how many observations are correctly
# or incorrectly classified
# Confusion matrix: predicted class (rows) vs actual default status (columns)
table(mod_3.pred,default)
## default
## mod_3.pred No Yes
## No 9390 130
## Yes 277 203
# Fraction of observations for which the prediction was correct
mean(mod_3.pred==default)
## [1] 0.9593
# Misclassification rate (1 - accuracy)
mean(mod_3.pred!=default)
## [1] 0.0407
# ROC Plot
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_obj <- roc(default, mod_3.probs)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Reuse the fitted roc object instead of refitting it for the plot
plot(roc_obj, col = "blue", legacy.axes = TRUE)
# Optimal (Youden) threshold; transpose = FALSE adopts the pROC >= 1.16
# default explicitly and silences the deprecation warning
coords(roc_obj, "best", ret = "threshold", transpose = FALSE)
## Warning in coords.roc(R, "best", ret = "threshold"): The 'transpose'
## argument is set to FALSE by default since pROC 1.16. Set transpose = TRUE
## explicitly to revert to the previous behavior, or transpose = FALSE to
## silence this warning. Type help(coords_transpose) for additional
## information.
## threshold
## 1 0.03120876
# Model Selection: stepwise search minimising AIC
library(MASS)
# trace = FALSE (not the reassignable alias F) suppresses per-step output;
# income is dropped, leaving default ~ balance + student
stepAIC(mod_3, trace = FALSE)
##
## Call: glm(formula = default ~ balance + student, family = binomial,
## data = Default)
##
## Coefficients:
## (Intercept) balance studentYes
## -10.749496 0.005738 -0.714878
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
## Null Deviance: 2921
## Residual Deviance: 1572 AIC: 1578
# Lift and Gain Charts
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
# Store the fitted probabilities as a column so gain_lift() can
# reference them by name
Default$mod_3.probs <- predict(mod_3, type = "response")
gain_lift(data = Default, score = "mod_3.probs", target = "default")
## Population Gain Lift Score.Point
## 1 10 78.38 7.84 0.07092534738
## 2 20 91.89 4.59 0.02104190396
## 3 30 96.70 3.22 0.00880320034
## 4 40 98.80 2.47 0.00401693056
## 5 50 99.10 1.98 0.00196619538
## 6 60 99.70 1.66 0.00094485119
## 7 70 100.00 1.43 0.00044286132
## 8 80 100.00 1.25 0.00017553872
## 9 90 100.00 1.11 0.00005139724
## 10 100 100.00 1.00 0.00001025695