# amta_final_notes.R
# Author: Admin
# Date: Wed Aug 14 15:08:48 2019
# Topics: step-wise AIC regression; multinomial logistic regression

### Step-wise AIC regression


library(caret) # model training framework (provides train())

## Loading required package: lattice

## Loading required package: ggplot2

library(leaps) # subset-selection routines; note that caret's "lmStepAIC"
               # method actually delegates to MASS::stepAIC (caret loads MASS)

## Warning: package 'leaps' was built under R version 3.5.1

library(car) # to check VIFs (variance inflation factors)

## Warning: package 'car' was built under R version 3.5.1

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.5.2

df <- mtcars # built-in data set: fuel consumption (mpg) plus 10 design variables

# Step 1: fit the full linear model (mpg regressed on every other column)


lm.fit <- lm(mpg ~ ., data = df)

# Step 2: check variance inflation factors for multicollinearity


vif(lm.fit)

## cyl disp hp drat wt qsec vs


## 15.373833 21.620241 9.832037 3.374620 15.164887 7.527958 4.965873
## am gear carb
## 4.648487 5.357452 7.908747

# Ideally the VIF for any independent variable should be less than 3
# (common rules of thumb also use 5 or 10 as the cut-off).


# All VIFs greater than 3 are observed:
# a case of severe multicollinearity.
# Under severe multicollinearity the F statistic can be significant while
# all individual IVs appear insignificant.

# Step 3: check the summary of the model


summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07

# All IVs are individually insignificant,


# which would naively suggest that no independent variable in the model
# determines the mileage of a vehicle (an artifact of multicollinearity).
# Residual standard error = 2.65

# Step 4: run step-wise regression (caret's "lmStepAIC" wraps MASS::stepAIC)


step.fit <- train(mpg ~ ., data = df,
                  method = "lmStepAIC",
                  trace = FALSE) # trace = FALSE suppresses the intermediate
                                 # stepAIC output

# Step 5: check the step.fit statistics


summary(step.fit)

##
## Call:
## lm(formula = .outcome ~ wt + qsec + am, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11

# Only three variables are included in the final model,


# and all three are significant.
# Residual standard error = 2.459 (better predictive power compared to lm.fit)

# Step 6: compare AIC of the two models


AIC(lm.fit) # 163.7

## [1] 163.7098

AIC(step.fit$finalModel) # 154.1

## [1] 154.1194

# Lower AIC is better.

### Multinomial logistic regression


library(nnet) # for multinom() (multinomial logistic regression)
library(ISLR) # for the Carseats data set

df <- Carseats
head(df)

## Sales CompPrice Income Advertising Population Price ShelveLoc Age


## 1 9.50 138 73 11 276 120 Bad 42
## 2 11.22 111 48 16 260 83 Good 65
## 3 10.06 113 35 10 269 80 Medium 59
## 4 7.40 117 100 4 466 97 Medium 55
## 5 4.15 141 64 3 340 128 Bad 38
## 6 10.81 124 113 13 501 72 Bad 78
## Education Urban US
## 1 17 Yes Yes
## 2 10 Yes Yes
## 3 12 Yes Yes
## 4 14 Yes Yes
## 5 13 Yes No
## 6 16 No Yes

# Shelf location for car seats (ShelveLoc) is the dependent variable.


# It has three levels -- Bad, Good, Medium -- hence multinomial logistic
# regression is to be done.
# Let the base (reference) level for ShelveLoc be "Bad".

# Step 1: relevel the DV so "Bad" is the reference category
class(df$ShelveLoc)

## [1] "factor"

df$ShelveLoc <- relevel(df$ShelveLoc, ref = "Bad")

contrasts(df$ShelveLoc)

## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1

# Bad is 0 throughout, i.e. Bad is the base category.

# Step 2: fit the multinomial logistic regression
# (one set of coefficients per non-reference level: Good vs Bad, Medium vs Bad)


mlog.fit <- multinom(ShelveLoc ~ ., data = df)

## # weights: 36 (22 variable)


## initial value 439.444915
## iter 10 value 337.400387
## iter 20 value 224.900527
## iter 30 value 151.419007
## iter 40 value 151.301805
## final value 151.301711
## converged

summary(mlog.fit)

## Call:
## multinom(formula = ShelveLoc ~ ., data = df)
##
## Coefficients:
## (Intercept) Sales CompPrice Income Advertising
## Good -38.91396 4.903290 -0.4661398 -0.07635094 -0.5378368
## Medium -12.17893 1.950679 -0.1776329 -0.04191173 -0.2420112
## Population Price Age Education UrbanYes USYes
## Good -0.002085516 0.4818368 0.2371202 0.03187005 -1.6010195 0.5957802
## Medium -0.001034355 0.1882456 0.1010462 0.04009025 -0.6622705 0.4524779
##
## Std. Errors:
## (Intercept) Sales CompPrice Income Advertising
## Good 0.4927225 0.3419561 0.05210688 0.013054261 0.08636365
## Medium 2.0688642 0.2155638 0.02577319 0.008188235 0.04814687
## Population Price Age Education UrbanYes USYes
## Good 0.002381584 0.03919383 0.02364099 0.11858296 0.7821109 1.0365972
## Medium 0.001202848 0.02286454 0.01503519 0.06840688 0.4096022 0.5265365
##
## Residual Deviance: 302.6034
## AIC: 346.6034

# Step 3: extract and transform the regression coefficients


ans <- summary(mlog.fit)$coefficients # full name; avoids partial matching via $coeff
ans

## (Intercept) Sales CompPrice Income Advertising


## Good -38.91396 4.903290 -0.4661398 -0.07635094 -0.5378368
## Medium -12.17893 1.950679 -0.1776329 -0.04191173 -0.2420112
## Population Price Age Education UrbanYes USYes
## Good -0.002085516 0.4818368 0.2371202 0.03187005 -1.6010195 0.5957802
## Medium -0.001034355 0.1882456 0.1010462 0.04009025 -0.6622705 0.4524779

ans <- t(ans) # transpose: variables as rows, outcome levels as columns


ans

## Good Medium
## (Intercept) -38.913962021 -12.178929923
## Sales 4.903289749 1.950678641
## CompPrice -0.466139802 -0.177632921
## Income -0.076350942 -0.041911735
## Advertising -0.537836790 -0.242011158
## Population -0.002085516 -0.001034355
## Price 0.481836809 0.188245600
## Age 0.237120207 0.101046165
## Education 0.031870046 0.040090249
## UrbanYes -1.601019544 -0.662270549
## USYes 0.595780229 0.452477868

ans <- data.frame(ans)
ans$exp.Good <- exp(ans$Good)     # exponentiated coefficients = odds ratios vs "Bad"
ans$exp.Medium <- exp(ans$Medium)
ans

## Good Medium exp.Good exp.Medium


## (Intercept) -38.913962021 -12.178929923 1.258581e-17 5.137572e-06
## Sales 4.903289749 1.950678641 1.347323e+02 7.033459e+00
## CompPrice -0.466139802 -0.177632921 6.274196e-01 8.372497e-01
## Income -0.076350942 -0.041911735 9.264910e-01 9.589544e-01
## Advertising -0.537836790 -0.242011158 5.840102e-01 7.850474e-01
## Population -0.002085516 -0.001034355 9.979167e-01 9.989662e-01
## Price 0.481836809 0.188245600 1.619046e+00 1.207130e+00
## Age 0.237120207 0.101046165 1.267593e+00 1.106328e+00
## Education 0.031870046 0.040090249 1.032383e+00 1.040905e+00
## UrbanYes -1.601019544 -0.662270549 2.016908e-01 5.156791e-01
## USYes 0.595780229 0.452477868 1.814446e+00 1.572203e+00

ans <- round(ans, 2)
ans

## Good Medium exp.Good exp.Medium


## (Intercept) -38.91 -12.18 0.00 0.00
## Sales 4.90 1.95 134.73 7.03
## CompPrice -0.47 -0.18 0.63 0.84
## Income -0.08 -0.04 0.93 0.96
## Advertising -0.54 -0.24 0.58 0.79
## Population 0.00 0.00 1.00 1.00
## Price 0.48 0.19 1.62 1.21
## Age 0.24 0.10 1.27 1.11
## Education 0.03 0.04 1.03 1.04
## UrbanYes -1.60 -0.66 0.20 0.52
## USYes 0.60 0.45 1.81 1.57

# Step 4: interpretation

# Interpreting the Sales odds ratios (exp.Good and exp.Medium):


# exp.Good for Sales = 134.73. All else constant, a unit increase in Sales
# multiplies the odds of a Good (rather than Bad) shelf location
# by about 134.7.

# exp.Medium for Sales = 7.03. All else constant, a unit increase in Sales
# multiplies the odds of a Medium (rather than Bad) shelf location
# by about 7.03.

# End of notes.