Stats101a Homework8
Stats101a Homework8
Stats101a Homework8
##
## Call:
## lm(formula = Balance ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -165.39 -79.18 -12.13 58.35 275.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -438.42608 42.49859 -10.316 < 2e-16 ***
## Income -7.66770 0.27565 -27.817 < 2e-16 ***
## Limit 0.18119 0.03925 4.617 6.05e-06 ***
## Rating 1.20753 0.58569 2.062 0.04020 *
## Cards 14.62746 5.07034 2.885 0.00423 **
## Age -1.03377 0.35347 -2.925 0.00374 **
## Education -0.75296 1.87220 -0.402 0.68787
## GenderFemale -8.79022 11.98775 -0.733 0.46404
## StudentYes 405.98264 19.62270 20.689 < 2e-16 ***
## MarriedYes -5.92417 12.74571 -0.465 0.64245
## EthnicityAsian 29.22944 17.17280 1.702 0.08990 .
## EthnicityCaucasian 6.36199 14.28007 0.446 0.65631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.01 on 268 degrees of freedom
## Multiple R-squared: 0.9507, Adjusted R-squared: 0.9487
## F-statistic: 469.6 on 11 and 268 DF, p-value: < 2.2e-16
library(DAAG)
vif(creditm1)
1
## 2.5667 214.2900 214.1000
## Cards Age Education
## 1.4726 1.0428 1.0341
## GenderFemale StudentYes MarriedYes
## 1.0262 1.0522 1.0606
## EthnicityAsian EthnicityCaucasian
## 1.4689 1.4555
par(mfrow=c(2,2))
plot(creditm1)
Standardized residuals
Residuals vs Fitted Normal QQ
3
96 121 121
Residuals
269 96
269
200 100
1
1
0 500 1000 1500 3 2 1 0 1 2 3
Standardized residuals
86
1.0
2 0
Cook's distance
0.0
Limit and Rating have very high vif, they are highly correlated to other variables.
The residuals are not randomly distributed around zero line, instead they follow a trend. The normality of
residuals does not seem to be violated, but the variance of residuals is not constant. Finally, there are not
many extreme leverage points.
library(car)
summary(powerTransform(cbind(train$Balance,train$Income,train$Limit,train$Rating,train$Cards,
train$Age,train$Education)))
2
## Y3 0.6612 0.0433 0.5763 0.7461
## Y4 0.5687 0.0528 0.4653 0.6722
## Y5 0.3516 0.0984 0.1587 0.5445
## Y6 0.7956 0.1975 0.4084 1.1828
## Y7 1.3834 0.2260 0.9404 1.8264
##
## Likelihood ratio tests about transformation parameters
## LRT df pval
## LR test, lambda = (0 0 0 0 0 0 0) 712.14383 7 0.000000000
## LR test, lambda = (1 1 1 1 1 1 1) 970.04152 7 0.000000000
## LR test, lambda = (0.33 0.33 0.66 0.5 0.5 1 1) 21.78435 7 0.002767181
newbalance<-sqrt(train$Balance)
newincome<-sqrt(train$Income)
newlimit<-sqrt(train$Limit)
newrating<-sqrt(train$Rating)
newcard<-sqrt(train$Cards)
newage<-sqrt(train$Age)
newedu<-train$Education^(1.5)
m3 <- lm(newbalance~newincome+newlimit+newrating+newcard+newage+newedu+
train$Gender+train$Student+train$Married+train$Ethnicity)
par(mfrow=c(2,2))
plot(m3)
Standardized residuals
0 2
0
10
45 53
103 53
10345
0 10 20 30 40 3 2 1 0 1 2 3
Standardized residuals
4 1
Cook's distance
0.0
45
3
step(m3,direction="backward",data=train)
## Start: AIC=594.65
## newbalance ~ newincome + newlimit + newrating + newcard + newage +
## newedu + train$Gender + train$Student + train$Married + train$Ethnicity
##
## Df Sum of Sq RSS AIC
## - train$Gender 1 0.1 2149.3 592.67
## - newcard 1 1.9 2151.0 592.90
## - train$Married 1 4.7 2153.8 593.26
## <none> 2149.1 594.65
## - newedu 1 15.8 2165.0 594.70
## - train$Ethnicity 2 42.3 2191.4 596.11
## - newlimit 1 36.9 2186.0 597.41
## - newage 1 51.5 2200.6 599.28
## - newrating 1 244.7 2393.8 622.84
## - train$Student 1 2886.6 5035.7 831.07
## - newincome 1 6006.2 8155.4 966.06
##
## Step: AIC=592.67
## newbalance ~ newincome + newlimit + newrating + newcard + newage +
## newedu + train$Student + train$Married + train$Ethnicity
##
## Df Sum of Sq RSS AIC
## - newcard 1 1.9 2151.1 590.91
## - train$Married 1 4.7 2153.9 591.27
## <none> 2149.3 592.67
## - newedu 1 15.9 2165.2 592.73
## - train$Ethnicity 2 42.3 2191.6 594.12
## - newlimit 1 36.8 2186.1 595.42
## - newage 1 51.4 2200.6 597.28
## - newrating 1 244.7 2394.0 620.86
## - train$Student 1 2907.5 5056.8 830.23
## - newincome 1 6009.6 8158.8 964.18
##
## Step: AIC=590.91
## newbalance ~ newincome + newlimit + newrating + newage + newedu +
## train$Student + train$Married + train$Ethnicity
##
## Df Sum of Sq RSS AIC
## - train$Married 1 5.2 2156.4 589.59
## <none> 2151.1 590.91
## - newedu 1 16.7 2167.9 591.08
## - train$Ethnicity 2 42.1 2193.2 592.34
## - newlimit 1 39.1 2190.2 593.95
## - newage 1 51.2 2202.4 595.50
## - newrating 1 364.9 2516.1 632.79
## - train$Student 1 2906.2 5057.4 828.27
## - newincome 1 6151.5 8302.6 967.07
##
## Step: AIC=589.59
## newbalance ~ newincome + newlimit + newrating + newage + newedu +
## train$Student + train$Ethnicity
4
##
## Df Sum of Sq RSS AIC
## <none> 2156.4 589.59
## - newedu 1 19.0 2175.4 590.04
## - train$Ethnicity 2 39.5 2195.9 590.67
## - newlimit 1 40.8 2197.2 592.84
## - newage 1 49.2 2205.6 593.91
## - newrating 1 360.9 2517.3 630.92
## - train$Student 1 2956.2 5112.6 829.31
## - newincome 1 6179.7 8336.1 966.20
##
## Call:
## lm(formula = newbalance ~ newincome + newlimit + newrating +
## newage + newedu + train$Student + train$Ethnicity)
##
## Coefficients:
## (Intercept) newincome
## -31.4254 -2.9678
## newlimit newrating
## 0.2400 2.9804
## newage newedu
## -0.3609 -0.0152
## train$StudentYes train$EthnicityAsian
## 10.6282 0.7629
## train$EthnicityCaucasian
## 0.8834
newstudent<-train$Student
newethnicity<-train$Ethnicity
m4<-lm(newbalance ~ newincome + newlimit + newrating + newage + newedu + newstudent + newethnicity)
summary(m4)
##
## Call:
## lm(formula = newbalance ~ newincome + newlimit + newrating +
## newage + newedu + newstudent + newethnicity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6922 -1.2015 0.3487 1.6157 7.3145
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31.425369 1.784104 -17.614 < 2e-16 ***
## newincome -2.967750 0.106493 -27.868 < 2e-16 ***
## newlimit 0.240021 0.105982 2.265 0.0243 *
## newrating 2.980358 0.442522 6.735 9.79e-11 ***
## newage -0.360880 0.145137 -2.486 0.0135 *
## newedu -0.015200 0.009843 -1.544 0.1237
## newstudentYes 10.628231 0.551405 19.275 < 2e-16 ***
## newethnicityAsian 0.762897 0.485035 1.573 0.1169
## newethnicityCaucasian 0.883443 0.405000 2.181 0.0300 *
5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.821 on 271 degrees of freedom
## Multiple R-squared: 0.9504, Adjusted R-squared: 0.9489
## F-statistic: 649.1 on 8 and 271 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(m4)
Standardized residuals
Residuals vs Fitted Normal QQ
Residuals
0 2
0
10
3
103 53 53
45 10345
0 10 20 30 40 3 2 1 0 1 2 3
Standardized residuals
17
1.0
4 1
Cook's distance
0.0
45
There is still a trend in residuals plot. Not all residuals satisties normality. Assumption of constant variance
is also violated. There is not too many bad leverage points.
test$Balance<-sqrt(test$Balance);colnames(test)[11]<-"newbalance"
test$Income<-sqrt(test$Income);colnames(test)[1]<-"newincome"
test$Limit<-sqrt(test$Limit);colnames(test)[2]<-"newlimit"
test$Rating<-sqrt(test$Rating);colnames(test)[3]<-"newrating"
test$Age<-sqrt(test$Age);colnames(test)[5]<-"newage"
test$Education<-test$Education^(1.5);colnames(test)[6]<-"newedu"
colnames(test)[8] <- "newstudent"
colnames(test)[10] <- "newethnicity"
yhat<-predict(m4,newdata=test)
testlm <- lm(yhat ~ test$newbalance)
summary(testlm)
6
##
## Call:
## lm(formula = yhat ~ test$newbalance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9074 -1.7807 -0.1424 1.2169 7.9342
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4823 0.4687 3.162 0.00199 **
## test$newbalance 0.9308 0.0199 46.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.897 on 118 degrees of freedom
## Multiple R-squared: 0.9489, Adjusted R-squared: 0.9484
## F-statistic: 2189 on 1 and 118 DF, p-value: < 2.2e-16
plot(test$newbalance,yhat)
lines(seq(0,50,0.1),seq(0,50,0.1),col="red")
40
30
yhat
20
10
0
10
0 10 20 30 40
test$newbalance
The model for yhat vs actual balance is good since the R Squared is 0.9489. However, since there are many
Balance = 0 in our reality data, which means there exists some people do not pay out their balance. Therefore,
in our predicted y vs actual y plot, there are some points gather at x = 0. We should investigate them about
why they cannot pay the balance and may be we should exclude them when modeling the data.