DMPM-LAB-03-Assignment: R code
R code:
## Read the dataset
toyota<-read.csv(file.choose())
View(toyota)

## Explore the data
str(toyota)
summary(toyota)
hist(toyota$Price)   ## distribution of car prices

## Create a numeric variable for the categorical variable FuelType
## with its three nominal outcomes: CNG, Diesel, and Petrol
drn <- factor(toyota$FuelType)
toyota$FuelType1=as.numeric(drn)

## Remove the unwanted predictor (the original FuelType factor in column 4)
auto=toyota[-4]
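## Note: as.numeric(factor(...)) codes CNG, Diesel, and Petrol as 1, 2, and 3, so
## FuelType1 enters the regression as a single numeric predictor. A sketch of an
## alternative coding (not part of the original assignment script): 0/1 dummy
## columns via model.matrix(), which drops the first level (CNG) as the baseline.
fuel_dummies <- model.matrix(~ FuelType, data=toyota)[,-1]
head(fuel_dummies)   # columns FuelTypeDiesel and FuelTypePetrol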
## data visualization
plot(Price~Age,data=auto)
cor(auto$Price,auto$Age)
plot(Price~KM,data=auto)
cor(auto$Price,auto$KM)
plot(Price~HP,data=auto)
cor(auto$Price,auto$HP)
plot(Price~MetColor,data=auto)
cor(auto$Price,auto$MetColor)
plot(Price~Automatic,data=auto)
cor(auto$Price,auto$Automatic)
plot(Price~CC,data=auto)
cor(auto$Price,auto$CC)
plot(Price~Doors,data=auto)
cor(auto$Price,auto$Doors)
plot(Price~Weight,data=auto)
cor(auto$Price,auto$Weight)
plot(Price~FuelType1,data=auto)
cor(auto$Price,auto$FuelType1)
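## The pairwise correlations above can also be read off in a single call
## (a compact alternative, not part of the original listing):
round(cor(auto),2)   # full correlation matrix; the Price row matches the values above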
set.seed(1)
## Fixing the seed value for the random selection guarantees the same results in repeated runs
n=length(auto$Price)   # total number of observations
n1=900                 # size of the training set
n2=n-n1                # size of the hold-out (test) set
n2
train=sample(1:n,n1)   # row indices of the randomly chosen training observations
## Regression on the training set
m1=lm(Price~.,data=auto[train,])
summary(m1)
plot(m1$residuals)
# Now predict the response variable "Price" for the hold-out (test) set
pred=predict(m1,newdata=auto[-train,])
obs=auto$Price[-train]
diff=obs-pred
percdiff=abs(diff)/obs
me=mean(diff)
rmse=sqrt(sum(diff**2)/n2)
mape=100*(mean(percdiff))
me # mean error
rmse # root mean square error
mape # mean absolute percent error
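## The same three accuracy measures are recomputed after each evaluation below.
## A small helper (a hypothetical convenience, not part of the original script)
## that wraps them:
evalPred <- function(obs, pred) {
  diff <- obs - pred
  c(me   = mean(diff),                  # mean error
    rmse = sqrt(mean(diff^2)),          # root mean square error
    mape = 100*mean(abs(diff)/obs))     # mean absolute percent error
}
## Example: evalPred(obs, pred) reproduces the hold-out figures printed above.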
## Cross-validation (leave one out): full model
n=length(auto$Price)
diff=dim(n)      # dim(n) is NULL; diff and percdiff are grown inside the loop
percdiff=dim(n)
for (k in 1:n) {
  train1=c(1:n)
  train=train1[train1!=k]            # leave observation k out
  m1=lm(Price~.,data=auto[train,])
  pred=predict(m1,newdata=auto[-train,])
  obs=auto$Price[-train]
  diff[k]=obs-pred
  percdiff[k]=abs(diff[k])/obs
}
me=mean(diff)
rmse=sqrt(mean(diff**2))
mape=100*(mean(percdiff))
me # mean error
rmse # root mean square error
mape # mean absolute percent error

## Cross-validation (leave one out): model with just Age
n=length(auto$Price)
diff=dim(n)
percdiff=dim(n)
for (k in 1:n) {
  train1=c(1:n)
  train=train1[train1!=k]
  m1=lm(Price~Age,data=auto[train,])
  pred=predict(m1,newdata=auto[-train,])
  obs=auto$Price[-train]
  diff[k]=obs-pred
  percdiff[k]=abs(diff[k])/obs
}
me=mean(diff)
rmse=sqrt(mean(diff**2))
mape=100*(mean(percdiff))
me # mean error
rmse # root mean square error
mape # mean absolute percent error
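## The leave-one-out loops above can also be run with boot::cv.glm() (an
## alternative sketch, not part of the original assignment; the lm fit is
## re-expressed as a Gaussian glm so that cv.glm() applies):
library(boot)
fit_glm <- glm(Price~., data=auto)   # default family is gaussian, equivalent to lm
cv_out  <- cv.glm(auto, fit_glm)     # K defaults to n, i.e. leave-one-out CV
sqrt(cv_out$delta[1])                # LOOCV estimate of the root mean square error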
Console Output:
> ## Read the dataset
> toyota<-read.csv(file.choose())
> View(toyota)
> ##Explore the data
> str(toyota)
'data.frame':1436 obs. of 10 variables:
$ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
$ Age : int 23 23 24 26 30 32 27 30 27 23 ...
$ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
$ FuelType : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 2 2 2 2 3 2 ...
$ HP : int 90 90 90 90 90 90 90 90 192 69 ...
$ MetColor : int 1 1 1 0 0 0 1 1 0 0 ...
$ Automatic: int 0 0 0 0 0 0 0 0 0 0 ...
$ CC : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
$ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
$ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
> summary(toyota)
Price Age KM FuelType
Min. : 4350 Min. : 1.00 Min. : 1 CNG : 17
1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 43000 Diesel: 155
Median : 9900 Median :61.00 Median : 63390 Petrol:1264
Mean :10731 Mean :55.95 Mean : 68533
3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 87021
Max. :32500 Max. :80.00 Max. :243000
HP MetColor Automatic CC
Min. : 69.0 Min. :0.0000 Min. :0.00000 Min. :1300
1st Qu.: 90.0 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1400
Median :110.0 Median :1.0000 Median :0.00000 Median :1600
Mean :101.5 Mean :0.6748 Mean :0.05571 Mean :1567
3rd Qu.:110.0 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1600
Max. :192.0 Max. :1.0000 Max. :1.00000 Max. :2000
Doors Weight
Min. :2.000 Min. :1000
1st Qu.:3.000 1st Qu.:1040
Median :4.000 Median :1070
Mean :4.033 Mean :1072
3rd Qu.:5.000 3rd Qu.:1085
Max. :5.000 Max. :1615
> hist(toyota$Price) ## Distribution of car prices
> ## create numeric variables for the categorical variable
> ## FuelType with its three nominal outcomes: CNG, Diesel, and Petrol
> drn <- factor(toyota$FuelType)
> toyota$FuelType1=as.numeric(drn)
> View(toyota)
> toyota[1:3,]
Price Age KM FuelType HP MetColor Automatic CC Doors Weight FuelType1
1 13500 23 46986 Diesel 90 1 0 2000 3 1165 2
2 13750 23 72937 Diesel 90 1 0 2000 3 1165 2
3 13950 24 41711 Diesel 90 1 0 2000 3 1165 2
> str(toyota)
'data.frame':1436 obs. of 11 variables:
$ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
$ Age : int 23 23 24 26 30 32 27 30 27 23 ...
$ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
$ FuelType : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 2 2 2 2 3 2 ...
$ HP : int 90 90 90 90 90 90 90 90 192 69 ...
$ MetColor : int 1 1 1 0 0 0 1 1 0 0 ...
$ Automatic: int 0 0 0 0 0 0 0 0 0 0 ...
$ CC : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
$ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
$ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
$ FuelType1: num 2 2 2 2 2 2 2 2 3 2 ...
> ## remove unwanted predictor
> auto=toyota[-4]
> View(auto)
> auto[1:3,]
Price Age KM HP MetColor Automatic CC Doors Weight FuelType1
1 13500 23 46986 90 1 0 2000 3 1165 2
2 13750 23 72937 90 1 0 2000 3 1165 2
3 13950 24 41711 90 1 0 2000 3 1165 2
> ## data visualization
> plot(Price~Age,data=auto)
> cor(auto$Price,auto$Age)
[1] -0.8765905
> plot(Price~KM,data=auto)
> cor(auto$Price,auto$KM)
[1] -0.5699602
> plot(Price~HP,data=auto)
> cor(auto$Price,auto$HP)
[1] 0.3149898
> plot(Price~MetColor,data=auto)
> cor(auto$Price,auto$MetColor)
[1] 0.1089048
> plot(Price~Automatic,data=auto)
> cor(auto$Price,auto$Automatic)
[1] 0.03308069
> plot(Price~CC,data=auto)
> cor(auto$Price,auto$CC)
[1] 0.165067
> plot(Price~Doors,data=auto)
> cor(auto$Price,auto$Doors)
[1] 0.1853255
> plot(Price~Weight,data=auto)
> cor(auto$Price,auto$Weight)
[1] 0.5811976
> plot(Price~FuelType1,data=auto)
> ## regression on the full dataset
> summary(lm(Price~.,data=auto))
Call:
lm(formula = Price ~ ., data = auto)
Residuals:
Min 1Q Median 3Q Max
-12352.0 -766.1 -2.2 755.8 6199.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.524e+03 1.296e+03 -5.804 7.98e-09 ***
Age -1.229e+02 2.633e+00 -46.686 < 2e-16 ***
KM -1.602e-02 1.330e-03 -12.048 < 2e-16 ***
HP 2.986e+01 3.003e+00 9.945 < 2e-16 ***
MetColor 3.851e+01 7.588e+01 0.508 0.6118
Automatic 1.794e+02 1.573e+02 1.140 0.2544
CC -1.368e+00 3.159e-01 -4.331 1.59e-05 ***
Doors -6.540e+01 3.952e+01 -1.655 0.0982 .
Weight 2.305e+01 1.116e+00 20.656 < 2e-16 ***
FuelType1 2.960e+02 1.629e+02 1.817 0.0694 .
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> set.seed(1)
> ## Fixing the seed value for the random selection guarantees the same results in repeated runs
> n=length(auto$Price)
> n1=900
> n2=n-n1
> n2
[1] 536
> train=sample(1:n,n1)
> ## regression on training set
> m1=lm(Price~.,data=auto[train,])
> summary(m1)
Call:
lm(formula = Price ~ ., data = auto[train, ])
Residuals:
Min 1Q Median 3Q Max
-11508.8 -736.2 6.0 732.0 6207.3
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.307e+03 1.659e+03 -3.801 0.000154 ***
Age -1.236e+02 3.445e+00 -35.879 < 2e-16 ***
KM -1.621e-02 1.790e-03 -9.058 < 2e-16 ***
HP 3.577e+01 4.317e+00 8.286 4.27e-16 ***
MetColor 1.394e+02 9.795e+01 1.423 0.154986
Automatic 3.872e+02 2.118e+02 1.828 0.067829 .
CC -1.125e+00 4.275e-01 -2.632 0.008628 **
Doors -4.876e+01 5.092e+01 -0.958 0.338526
Weight 2.139e+01 1.272e+00 16.820 < 2e-16 ***
FuelType1 1.083e+02 2.526e+02 0.429 0.668151
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> plot(m1$residuals)
> # Now predict the response variable values "Price" for the test dataset
> pred=predict(m1,newdat=auto[-train,])
> obs=auto$Price[-train]
> diff=obs-pred
> percdiff=abs(diff)/obs
> me=mean(diff)
> rmse=sqrt(sum(diff**2)/n2)
> mape=100*(mean(percdiff))
> me # mean error
[1] 59.92978
> rmse # root mean square error
[1] 1287.957
> mape # mean absolute percent error
[1] 10.13693
> ## cross-validation (leave one out)
> n=length(auto$Price)
> diff=dim(n)
> percdiff=dim(n)
> for (k in 1:n) {
+ train1=c(1:n)
+ train=train1[train1!=k]
+ m1=lm(Price~.,data=auto[train,])
+ pred=predict(m1,newdat=auto[-train,])
+ obs=auto$Price[-train]
+ diff[k]=obs-pred
+ percdiff[k]=abs(diff[k])/obs
+}
> me=mean(diff)
> rmse=sqrt(mean(diff**2))
> mape=100*(mean(percdiff))
> me # mean error
[1] -2.494298
> rmse # root mean square error
[1] 1372.747
> mape # mean absolute percent error
[1] 9.662033
> ## cross-validation (leave one out): Model with just Age
> n=length(auto$Price)
> diff=dim(n)
> percdiff=dim(n)
> for (k in 1:n) {
+ train1=c(1:n)
+ train=train1[train1!=k]
+ m1=lm(Price~Age,data=auto[train,])
+ pred=predict(m1,newdat=auto[-train,])
+ obs=auto$Price[-train]
+ diff[k]=obs-pred
+ percdiff[k]=abs(diff[k])/obs
+}
> me=mean(diff)
> rmse=sqrt(mean(diff**2))
> mape=100*(mean(percdiff))
> me # mean error
[1] 0.6085014
> rmse # root mean square error
[1] 1748.76
> mape # mean absolute percent error
[1] 12.13156