# amta_final_notes.R
# Author: Admin
# Date: Wed Aug 14 15:08:48 2019
# Topics: step-wise AIC regression; multinomial logistic regression

### Step-wise AIC regression


library(caret) # model training framework (provides train())

## Loading required package: lattice

## Loading required package: ggplot2

library(leaps) # subset-selection routines; note that caret's "lmStepAIC"
               # method actually delegates to MASS::stepAIC (caret loads MASS)

## Warning: package 'leaps' was built under R version 3.5.1

library(car) # to check VIFs (variance inflation factors)

## Warning: package 'car' was built under R version 3.5.1

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.5.2

df <- mtcars # built-in data set: fuel consumption (mpg) plus 10 design variables

# Step 1: fit the full linear model (mpg regressed on every other column)


lm.fit <- lm(mpg ~ ., data = df)

# Step 2: check variance inflation factors for multicollinearity


vif(lm.fit)

## cyl disp hp drat wt qsec vs


## 15.373833 21.620241 9.832037 3.374620 15.164887 7.527958 4.965873
## am gear carb
## 4.648487 5.357452 7.908747

# Ideally the VIF for any independent variable should be less than 3
# (common rules of thumb also use 5 or 10 as the cut-off).


# All VIFs greater than 3 are observed:
# a case of severe multicollinearity.
# Under severe multicollinearity the F statistic can be significant while
# all individual IVs appear insignificant.

# Step 3: check the summary of the model


summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07

# All IVs are individually insignificant,


# which would naively suggest that no independent variable in the model
# determines the mileage of a vehicle (an artifact of multicollinearity).
# Residual standard error = 2.65

# Step 4: run step-wise regression (caret's "lmStepAIC" wraps MASS::stepAIC)


step.fit <- train(mpg ~ ., data = df,
                  method = "lmStepAIC",
                  trace = FALSE) # trace = FALSE suppresses the intermediate
                                 # stepAIC output

# Step 5: check the step.fit statistics


summary(step.fit)

##
## Call:
## lm(formula = .outcome ~ wt + qsec + am, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11

# Only three variables are included in the final model,


# and all three are significant.
# Residual standard error = 2.459 (better predictive power compared to lm.fit)

# Step 6: compare AIC of the two models


AIC(lm.fit) # 163.7

## [1] 163.7098

AIC(step.fit$finalModel) # 154.1

## [1] 154.1194

# Lower AIC is better.

### Multinomial logistic regression


library(nnet) # for multinom() (multinomial logistic regression)
library(ISLR) # for the Carseats data set

df <- Carseats
head(df)

## Sales CompPrice Income Advertising Population Price ShelveLoc Age


## 1 9.50 138 73 11 276 120 Bad 42
## 2 11.22 111 48 16 260 83 Good 65
## 3 10.06 113 35 10 269 80 Medium 59
## 4 7.40 117 100 4 466 97 Medium 55
## 5 4.15 141 64 3 340 128 Bad 38
## 6 10.81 124 113 13 501 72 Bad 78
## Education Urban US
## 1 17 Yes Yes
## 2 10 Yes Yes
## 3 12 Yes Yes
## 4 14 Yes Yes
## 5 13 Yes No
## 6 16 No Yes

# Shelf location for car seats (ShelveLoc) is the dependent variable.


# It has three levels -- Bad, Good, Medium -- hence multinomial logistic
# regression is to be done.
# Let the base (reference) level for ShelveLoc be "Bad".

# Step 1: relevel the DV so "Bad" is the reference category
class(df$ShelveLoc)

## [1] "factor"

df$ShelveLoc <- relevel(df$ShelveLoc, ref = "Bad")

contrasts(df$ShelveLoc)

## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1

# Bad is 0 throughout, i.e. Bad is the base category.

# Step 2: fit the multinomial logistic regression
# (one set of coefficients per non-reference level: Good vs Bad, Medium vs Bad)


mlog.fit <- multinom(ShelveLoc ~ ., data = df)

## # weights: 36 (22 variable)


## initial value 439.444915
## iter 10 value 337.400387
## iter 20 value 224.900527
## iter 30 value 151.419007
## iter 40 value 151.301805
## final value 151.301711
## converged

summary(mlog.fit)

## Call:
## multinom(formula = ShelveLoc ~ ., data = df)
##
## Coefficients:
## (Intercept) Sales CompPrice Income Advertising
## Good -38.91396 4.903290 -0.4661398 -0.07635094 -0.5378368
## Medium -12.17893 1.950679 -0.1776329 -0.04191173 -0.2420112
## Population Price Age Education UrbanYes USYes
## Good -0.002085516 0.4818368 0.2371202 0.03187005 -1.6010195 0.5957802
## Medium -0.001034355 0.1882456 0.1010462 0.04009025 -0.6622705 0.4524779
##
## Std. Errors:
## (Intercept) Sales CompPrice Income Advertising
## Good 0.4927225 0.3419561 0.05210688 0.013054261 0.08636365
## Medium 2.0688642 0.2155638 0.02577319 0.008188235 0.04814687
## Population Price Age Education UrbanYes USYes
## Good 0.002381584 0.03919383 0.02364099 0.11858296 0.7821109 1.0365972
## Medium 0.001202848 0.02286454 0.01503519 0.06840688 0.4096022 0.5265365
##
## Residual Deviance: 302.6034
## AIC: 346.6034

# Step 3: extract and transform the regression coefficients


ans <- summary(mlog.fit)$coefficients # full name; avoids partial matching via $coeff
ans

## (Intercept) Sales CompPrice Income Advertising


## Good -38.91396 4.903290 -0.4661398 -0.07635094 -0.5378368
## Medium -12.17893 1.950679 -0.1776329 -0.04191173 -0.2420112
## Population Price Age Education UrbanYes USYes
## Good -0.002085516 0.4818368 0.2371202 0.03187005 -1.6010195 0.5957802
## Medium -0.001034355 0.1882456 0.1010462 0.04009025 -0.6622705 0.4524779

ans <- t(ans) # transpose: variables as rows, outcome levels as columns


ans

## Good Medium
## (Intercept) -38.913962021 -12.178929923
## Sales 4.903289749 1.950678641
## CompPrice -0.466139802 -0.177632921
## Income -0.076350942 -0.041911735
## Advertising -0.537836790 -0.242011158
## Population -0.002085516 -0.001034355
## Price 0.481836809 0.188245600
## Age 0.237120207 0.101046165
## Education 0.031870046 0.040090249
## UrbanYes -1.601019544 -0.662270549
## USYes 0.595780229 0.452477868

ans <- data.frame(ans)
ans$exp.Good <- exp(ans$Good)     # exponentiated coefficients = odds ratios vs "Bad"
ans$exp.Medium <- exp(ans$Medium)
ans

## Good Medium exp.Good exp.Medium


## (Intercept) -38.913962021 -12.178929923 1.258581e-17 5.137572e-06
## Sales 4.903289749 1.950678641 1.347323e+02 7.033459e+00
## CompPrice -0.466139802 -0.177632921 6.274196e-01 8.372497e-01
## Income -0.076350942 -0.041911735 9.264910e-01 9.589544e-01
## Advertising -0.537836790 -0.242011158 5.840102e-01 7.850474e-01
## Population -0.002085516 -0.001034355 9.979167e-01 9.989662e-01
## Price 0.481836809 0.188245600 1.619046e+00 1.207130e+00
## Age 0.237120207 0.101046165 1.267593e+00 1.106328e+00
## Education 0.031870046 0.040090249 1.032383e+00 1.040905e+00
## UrbanYes -1.601019544 -0.662270549 2.016908e-01 5.156791e-01
## USYes 0.595780229 0.452477868 1.814446e+00 1.572203e+00

ans <- round(ans, 2)
ans

## Good Medium exp.Good exp.Medium


## (Intercept) -38.91 -12.18 0.00 0.00
## Sales 4.90 1.95 134.73 7.03
## CompPrice -0.47 -0.18 0.63 0.84
## Income -0.08 -0.04 0.93 0.96
## Advertising -0.54 -0.24 0.58 0.79
## Population 0.00 0.00 1.00 1.00
## Price 0.48 0.19 1.62 1.21
## Age 0.24 0.10 1.27 1.11
## Education 0.03 0.04 1.03 1.04
## UrbanYes -1.60 -0.66 0.20 0.52
## USYes 0.60 0.45 1.81 1.57

# Step 4: interpretation

# Interpreting the Sales odds ratios (exp.Good and exp.Medium):


# exp.Good for Sales = 134.73. All else constant, a unit increase in Sales
# multiplies the odds of a Good (rather than Bad) shelf location
# by about 134.7.

# exp.Medium for Sales = 7.03. All else constant, a unit increase in Sales
# multiplies the odds of a Medium (rather than Bad) shelf location
# by about 7.03.

# End of notes.