9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
Data Wrangling (Data Preprocessing) Code
Mid-term assessment
Siddharth Dinkar Raul (s4015125)
18-09-2023
Setup
Hide
# Load the necessary packages required to reproduce the report.
library(tibble)
library(dplyr)
library(lubridate)
Data generation
Hide
file:///C:/Users/SIDDHARTH/Downloads/Data Wrangling 2/Mid-term-Assessment-Rmarkdown-Template.nb.html 1/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
# Data generation: synthetic sales dataset ----
# Builds a 150-row sales table with random dates in 2023, injects missing
# values and outliers, and writes the result to "sales_data.csv".

# Every calendar day of 2023; sale dates are drawn from this range.
start_date <- as.Date("2023-01-01")
end_date <- as.Date("2023-12-31")
date_range <- seq(start_date, end_date, by = "days")

# Fix the RNG state so the generated data is reproducible.
set.seed(285)

n_sales <- 150

# NOTE(review): `words` is not defined in this file and is not exported by
# tibble, dplyr or lubridate -- presumably stringr::words; confirm and load
# the providing package in the setup chunk.
# NOTE(review): customer_id is drawn from 1:500 while the customer dataset
# only defines ids 1:200, and product_id is an integer here but a factor in
# the inventory dataset -- reconcile before joining on either key.
sales_data <- tibble(
  date = sample(date_range, n_sales, replace = TRUE),
  product_id = sample(1:200, n_sales, replace = TRUE),
  product_name = replicate(n_sales, paste(sample(words, 2), collapse = " ")),
  # as.numeric() keeps the column double rather than integer.
  quantity_sold = as.numeric(sample(1:20, n_sales, replace = TRUE)),
  price = runif(n_sales, min = 50, max = 500),
  customer_id = as.factor(sample(1:500, n_sales, replace = TRUE)),
  store_id = as.factor(sample(1:5, n_sales, replace = TRUE)) # Common variable "store_id"
)

# Introduce missing values in the "price" column (5 of 150 rows, about 3%).
missing_price_rows <- sample(seq_len(n_sales), 5)
sales_data$price[missing_price_rows] <- NA

# Introduce outliers. The row index is sampled ONCE per column and reused on
# both sides of the assignment, so each chosen row is scaled in place instead
# of being overwritten with a scaled value taken from some other random row.
qty_outlier_rows <- sample(seq_len(n_sales), 5)
sales_data$quantity_sold[qty_outlier_rows] <-
  sales_data$quantity_sold[qty_outlier_rows] * 10

# Only rows that still have a price are eligible; NA * 2 is NA and would
# silently produce fewer outliers than requested.
priced_rows <- which(!is.na(sales_data$price))
price_outlier_rows <- sample(priced_rows, 5)
sales_data$price[price_outlier_rows] <-
  sales_data$price[price_outlier_rows] * 2

# Export to CSV
write.csv(sales_data, "sales_data.csv", row.names = FALSE)
# Creating the second dataset (Customer dataset) ----
# Builds a 200-row customer table with random names and e-mail addresses,
# blanks out some e-mails, and writes the result to "customer_data.csv".

# Fix the RNG state so the generated data is reproducible.
set.seed(286)

n_customers <- 200

customer_data <- tibble(
  customer_id = as.factor(seq_len(n_customers)),
  # Five distinct random upper-case letters per customer.
  customer_name = replicate(
    n_customers,
    paste(sample(LETTERS, 5), collapse = "")
  ),
  # Five distinct random lower-case letters followed by a fixed domain. The
  # domain is a single-line literal so the address contains no embedded
  # newline.
  email = paste0(
    replicate(n_customers, paste(sample(letters, 5), collapse = "")),
    "@example.com"
  ),
  # as.numeric() keeps the column double rather than integer.
  total_purchases = as.numeric(sample(100:1000, n_customers, replace = TRUE)),
  # Roughly 60% of customers are flagged as members.
  is_member = sample(
    c(TRUE, FALSE),
    n_customers,
    replace = TRUE,
    prob = c(0.6, 0.4)
  ),
  store_id = as.factor(sample(1:5, n_customers, replace = TRUE)) # Common variable "store_id"
)

# Introduce missing values in the "email" column (10 of 200 rows, 5%).
missing_email_rows <- sample(seq_len(n_customers), 10)
customer_data$email[missing_email_rows] <- NA

# Export to CSV
write.csv(customer_data, "customer_data.csv", row.names = FALSE)
file:///C:/Users/SIDDHARTH/Downloads/Data Wrangling 2/Mid-term-Assessment-Rmarkdown-Template.nb.html 2/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
# Creating the third dataset (Inventory dataset) ----
# Builds a 200-row inventory table, injects missing stock levels and price
# outliers, and writes the result to "inventory_data.csv".

# Fix the RNG state so the generated data is reproducible.
set.seed(789)

n_products <- 200

# NOTE(review): `words` is not defined in this file and is not exported by
# tibble, dplyr or lubridate -- presumably stringr::words; confirm and load
# the providing package in the setup chunk.
inventory_data <- tibble(
  product_id = as.factor(seq_len(n_products)),
  product_name = replicate(
    n_products,
    paste(sample(words, 2), collapse = " ")
  ),
  # as.numeric() keeps the column double rather than integer.
  stock_level = as.numeric(sample(1:100, n_products, replace = TRUE)),
  # Three distinct random upper-case letters as a supplier code.
  supplier = replicate(n_products, paste(sample(LETTERS, 3), collapse = "")),
  cost_price = runif(n_products, min = 50, max = 200),
  selling_price = runif(n_products, min = 100, max = 500),
  store_id = as.factor(sample(1:5, n_products, replace = TRUE)) # Common variable "store_id"
)

# Introduce missing values in the "stock_level" column (10 of 200 rows, 5%).
missing_stock_rows <- sample(seq_len(n_products), 10)
inventory_data$stock_level[missing_stock_rows] <- NA

# Introduce outliers. The row index is sampled ONCE per column and reused on
# both sides of the assignment, so each chosen row is scaled in place instead
# of being overwritten with a scaled value taken from some other random row.
cost_outlier_rows <- sample(seq_len(n_products), 5)
inventory_data$cost_price[cost_outlier_rows] <-
  inventory_data$cost_price[cost_outlier_rows] * 0.5

selling_outlier_rows <- sample(seq_len(n_products), 5)
inventory_data$selling_price[selling_outlier_rows] <-
  inventory_data$selling_price[selling_outlier_rows] * 2

# Export to CSV
write.csv(inventory_data, "inventory_data.csv", row.names = FALSE)
Provide explanations here.
Merging data sets
Hide
# Merge your synthetic data sets, provide R codes here.
Provide explanations here.
Checking structure of combined data
Hide
# Check structure of combined data and perform all necessary data type conversions, provide R codes here.
Provide explanations here.
Generate summary statistics
Hide
# Generate summary statistics, provide R codes here.
file:///C:/Users/SIDDHARTH/Downloads/Data Wrangling 2/Mid-term-Assessment-Rmarkdown-Template.nb.html 3/4
9/18/23, 7:29 PM Data Wrangling (Data Preprocessing)
Provide explanations here.
Scanning data
Hide
# Scan variables for missing values, provide R codes here.
Provide explanations here.
file:///C:/Users/SIDDHARTH/Downloads/Data Wrangling 2/Mid-term-Assessment-Rmarkdown-Template.nb.html 4/4