100% found this document useful (1 vote)
84 views10 pages

Name: Reg. No.: Lab Exercise:: Shivam Batra 19BPS1131

1. The document describes applying logistic regression to predict red wine quality using a dataset containing physicochemical properties of red wines. 2. Exploratory data analysis was conducted including univariate analysis of variables and bivariate correlation analysis. The data was then prepared by handling missing values and outliers. 3. Logistic regression was performed on a training dataset to predict wine quality, which was evaluated on a test dataset, achieving an accuracy of 98% but with a poor ROC curve AUC of 0.511.

Uploaded by

Shivam Batra
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
100% found this document useful (1 vote)
84 views10 pages

Name: Reg. No.: Lab Exercise:: Shivam Batra 19BPS1131

1. The document describes applying logistic regression to predict red wine quality using a dataset containing physicochemical properties of red wines. 2. Exploratory data analysis was conducted including univariate analysis of variables and bivariate correlation analysis. The data was then prepared by handling missing values and outliers. 3. Logistic regression was performed on a training dataset to predict wine quality, which was evaluated on a test dataset, achieving an accuracy of 98% but with a poor ROC curve AUC of 0.511.

Uploaded by

Shivam Batra
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 10

CSE3506 Essentials of Data Analytics

Name : Shivam Batra


Reg. No. : 19BPS1131
Lab Exercise: 5

Objective: Applying logistic regression to predict red wine.

Methods -

1. Exploratory data analysis (EDA)

2. Data preparation

3. Modeling -> Logistic regression

4. ROC Curve

STEPS:

#Importing the dataset


data <- read.csv('winequality-red.csv', sep = ';')
str(data)

## 'data.frame': 1599 obs. of 12 variables:


## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...

#Format outcome variable


data$quality <- ifelse(data$quality >= 7, 1, 0)
data$quality <- factor(data$quality, levels = c(0, 1))

#Descriptive statistics
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 0:1382
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1: 217
## Median :3.310 Median :0.6200 Median :10.20
## Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :4.010 Max. :2.0000 Max. :14.90

Univariate analysis

#Dependent variable
#Frequency plot
par(mfrow=c(1,1))
barplot(table(data[[12]]),
main = sprintf('Frequency plot of the variable: %s',
colnames(data[12])),
xlab = colnames(data[12]),
ylab = 'Frequency')
#Check class BIAS
table(data$quality)

##
## 0 1
## 1382 217

round(prop.table((table(data$quality))),2)

##
## 0 1
## 0.86 0.14

#Independent variable
#Boxplots
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
boxplot(x = data[i],
horizontal = TRUE,
main = sprintf('Boxplot of the variable: %s',
colnames(data[i])),
xlab = colnames(data[i]))
}
#Histograms
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
hist(x = data[[i]],
main = sprintf('Histogram of the variable: %s',
colnames(data[i])),
xlab = colnames(data[i]))
}
Bivariate analysis

#Correlation matrix
library(ggcorrplot)

## Loading required package: ggplot2

ggcorrplot(round(cor(data[-12]), 2),
type = "lower",
lab = TRUE,
title =
'Correlation matrix of the red wine quality dataset')
Data preparation

#Missing values
sum(is.na(data))

## [1] 0

#Outliers
#Identifing outliers
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) |
x > quantile(x, 0.75) + 1.5 * IQR(x))
}
outlier <- data.frame(variable = character(),
sum_outliers = integer(),
stringsAsFactors=FALSE)
for (j in 1:(length(data)-1)){
variable <- colnames(data[j])
for (i in data[j]){
sum_outliers <- sum(is_outlier(i))
}
row <- data.frame(variable,sum_outliers)
outlier <- rbind(outlier, row)
}
outlier

## variable sum_outliers
## 1 fixed.acidity 49
## 2 volatile.acidity 19
## 3 citric.acid 1
## 4 residual.sugar 155
## 5 chlorides 112
## 6 free.sulfur.dioxide 30
## 7 total.sulfur.dioxide 55
## 8 density 45
## 9 pH 35
## 10 sulphates 59
## 11 alcohol 13

#Identifying the percentage of outliers


for (i in 1:nrow(outlier)){
if (outlier[i,2]/nrow(data) * 100 >= 5){
print(paste(outlier[i,1],
'=',
round(outlier[i,2]/nrow(data) * 100, digits = 2),
'%'))
}
}

## [1] "residual.sugar = 9.69 %"


## [1] "chlorides = 7 %"

#Inputting outlier values


for (i in 4:5){
for (j in 1:nrow(data)){
if (data[[j, i]] > as.numeric(quantile(data[[i]], 0.75) +
1.5 * IQR(data[[i]]))){
if (i == 4){
data[[j, i]] <- round(mean(data[[i]]), digits = 2)
} else{
data[[j, i]] <- round(mean(data[[i]]), digits = 3)
}
}
}
}

Modeling
#Splitting the dataset into the Training set and Test set
#Stratified sample
data_ones <- data[which(data$quality == 1), ]
data_zeros <- data[which(data$quality == 0), ]
#Train data
set.seed(123)
train_ones_rows <- sample(1:nrow(data_ones), 0.8*nrow(data_ones))
train_zeros_rows <- sample(1:nrow(data_zeros), 0.8*nrow(data_ones))
train_ones <- data_ones[train_ones_rows, ]
train_zeros <- data_zeros[train_zeros_rows, ]
train_set <- rbind(train_ones, train_zeros)
table(train_set$quality)

##
## 0 1
## 173 173

#Test Data
test_ones <- data_ones[-train_ones_rows, ]
test_zeros <- data_zeros[-train_zeros_rows, ]
test_set <- rbind(test_ones, test_zeros)
table(test_set$quality)

##
## 0 1
## 1209 44

Logistic regression

#Logistic Regression
lr = glm(formula = quality ~.,
data = train_set,
family = binomial)
#Predictions
prob_pred = predict(lr,
type = 'response',
newdata = test_set[-12])
library(InformationValue)
optCutOff <- optimalCutoff(test_set$quality, prob_pred)[1]
y_pred = ifelse(prob_pred > optCutOff, 1, 0)

#Making the confusion matrix


cm_lr = table(test_set[, 12], y_pred)
cm_lr

## y_pred
## 0 1
## 0 1207 2
## 1 43 1

#Accuracy
accuracy_lr = (cm_lr[1,1] + cm_lr[1,1])/
(cm_lr[1,1] + cm_lr[1,1] + cm_lr[2,1] + cm_lr[1,2])
accuracy_lr

## [1] 0.9816999

#ROC curve
library(ROSE)

## Loaded ROSE 0.0-4

par(mfrow = c(1, 1))


roc.curve(test_set$quality, y_pred)

## Area under the curve (AUC): 0.511

You might also like