Ashutosh Anand
DL Assignment Lab 3 SuperStore
202318035
DATASET 1: SUPERSTORE
In [ ]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
In [ ]: # Load the dataset, specifying the encoding format
superstore_data = pd.read_csv('/content/Sample - Superstore.csv', encoding='ISO-8859-1')
# Preview the first few rows of the dataset
superstore_data.head()
Out[ ]:    Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID    Customer Name    Segment        Country             City
        0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520      Claire Gute   Consumer  United States        Henderson
        1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520      Claire Gute   Consumer  United States        Henderson
        2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045  Darrin Van Huff  Corporate  United States      Los Angeles
        3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   Sean O'Donnell   Consumer  United States  Fort Lauderdale
        4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   Sean O'Donnell   Consumer  United States  Fort Lauderdale

        5 rows × 21 columns (remaining columns truncated in this preview)
In [ ]: # Display the count of missing values in each column
print(superstore_data.isna().sum())
# Remove rows where the 'Profit' column has missing values
superstore_data_cleaned = superstore_data.dropna(subset=['Profit'])
Row ID 0
Order ID 0
Order Date 0
Ship Date 0
Ship Mode 0
Customer ID 0
Customer Name 0
Segment 0
Country 0
City 0
State 0
Postal Code 0
Region 0
Product ID 0
Category 0
Sub-Category 0
Product Name 0
Sales 0
Quantity 0
Discount 0
Profit 0
dtype: int64
In [ ]: # Convert 'Order Date' and 'Ship Date' to datetime format, handling errors
superstore_data['Order Date'] = pd.to_datetime(superstore_data['Order Date'], format='%m/%d/%Y', errors='coerce')
superstore_data['Ship Date'] = pd.to_datetime(superstore_data['Ship Date'], format='%m/%d/%Y', errors='coerce')
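Because errors='coerce' turns any unparseable date into NaT instead of raising an exception, it is worth confirming that nothing was silently lost. A minimal sanity check (a sketch, not part of the original run):
# Dates that failed to parse would now be NaT; nonzero counts here
# would indicate the format string does not match the raw data
print(superstore_data['Order Date'].isna().sum())
print(superstore_data['Ship Date'].isna().sum())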
In [ ]: # Filter out rows with non-positive 'Sales' or 'Profit' values
superstore_data_filtered = superstore_data[(superstore_data['Sales'] > 0) & (superstore_data['Profit'] > 0)]
In [ ]: # Statistical summary of the dataset
superstore_data.describe()
Out[ ]:            Row ID                     Order Date                      Ship Date   Postal Code         Sales     Quantity
        count 8058.000000                           8058                           8058   8058.000000   8058.000000  8058.000000
        mean  4967.272648  2016-05-01 01:06:07.237527808  2016-05-04 23:59:49.277736704  55016.047779    223.480623     3.794738
        min      1.000000            2014-01-03 00:00:00            2014-01-07 00:00:00   1040.000000      0.990000     1.000000
        25%   2483.250000            2015-05-26 00:00:00            2015-05-29 00:00:00  21843.250000     17.940000     2.000000
        50%   4965.000000            2016-06-28 00:00:00            2016-07-02 00:00:00  53711.000000     50.965000     3.000000
        75%   7427.250000            2017-05-15 00:00:00            2017-05-19 18:00:00  90045.000000    191.976000     5.000000
        max   9994.000000            2017-12-30 00:00:00            2018-01-05 00:00:00  99301.000000  17499.950000    14.000000
        std   2872.723630                            NaN                            NaN  33295.003380    600.340641     2.244696

        (remaining columns truncated in this view)
In [ ]: superstore_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8058 entries, 0 to 9993
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row ID 8058 non-null int64
1 Order ID 8058 non-null object
2 Order Date 8058 non-null datetime64[ns]
3 Ship Date 8058 non-null datetime64[ns]
4 Ship Mode 8058 non-null object
5 Customer ID 8058 non-null object
6 Customer Name 8058 non-null object
7 Segment 8058 non-null object
8 Country 8058 non-null object
9 City 8058 non-null object
10 State 8058 non-null object
11 Postal Code 8058 non-null int64
12 Region 8058 non-null object
13 Product ID 8058 non-null object
14 Category 8058 non-null object
15 Sub-Category 8058 non-null object
16 Product Name 8058 non-null object
17 Sales 8058 non-null float64
18 Quantity 8058 non-null int64
19 Discount 8058 non-null float64
20 Profit 8058 non-null float64
dtypes: datetime64[ns](2), float64(3), int64(3), object(13)
memory usage: 1.4+ MB
In [ ]: # Plotting the correlation heatmap for numerical columns
plt.figure(figsize=(10, 6))
numeric_cols = superstore_data_filtered.select_dtypes(include='number')  # Extract numerical columns
correlation_matrix = numeric_cols.corr()
# Generate the heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Heatmap of Correlation Matrix')
plt.show()
In [ ]: # Scatter plot to visualize relationship between 'Sales' and 'Profit'
plt.figure(figsize=(10, 6))
plt.scatter(superstore_data_filtered['Sales'], superstore_data_filtered['Profit'], alpha=0.5)
plt.title('Relationship Between Sales and Profit')
plt.xlabel('Sales Amount')
plt.ylabel('Profit Earned')
plt.grid(True)
plt.show()
In [ ]: # Apply one-hot encoding to categorical features
encoded_data = pd.get_dummies(superstore_data_filtered, columns=['Ship Mode', 'Segment', 'Category', 'Sub-Category', 'Region'])
# Drop identifier, free-text, and date columns that won't be used in the model
columns_to_drop = ['Order ID', 'Customer ID', 'Customer Name', 'City', 'State', 'Postal Code',
                   'Country', 'Product ID', 'Product Name', 'Order Date', 'Ship Date', 'Row ID']
encoded_data_cleaned = encoded_data.drop(columns=columns_to_drop)
# Define the feature matrix (X) and target variable (y)
X = encoded_data_cleaned.drop(columns=['Profit'])
y = encoded_data_cleaned['Profit']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Standardize the feature matrix
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
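The scaler is fit on the training split only and then applied to the test split, which prevents test-set statistics from leaking into training. A quick check that the transform behaves as expected (illustrative sketch, not from the original run):
# Standardized training features should have mean ~0 and standard deviation ~1
print(X_train_scaled.mean(axis=0).round(6))
print(X_train_scaled.std(axis=0).round(6))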
In [ ]: # Initialize and train the linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train_scaled, y_train)
# Make predictions using the test set
predicted_values = linear_regressor.predict(X_test_scaled)
In [ ]: # Calculate performance metrics
mean_sq_error = mean_squared_error(y_test, predicted_values)
r_squared = r2_score(y_test, predicted_values)
# Display the results
print(f"Mean Squared Error (MSE): {mean_sq_error}")
print(f"R-squared (R²): {r_squared}")
Mean Squared Error: 10086.950159098482
R-squared: 0.6454160278125762
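Since MSE is expressed in squared units of profit, taking its square root gives an error in the same units as the target, which is easier to interpret. A short sketch deriving RMSE from the value above:
# RMSE puts the typical prediction error back in the same units as Profit
rmse = np.sqrt(mean_sq_error)
print(f"Root Mean Squared Error: {rmse}")  # roughly 100.4 for the MSE above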
In [ ]: # Residual plot to visualize the difference between actual and predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test - predicted_values, alpha=0.5, color='blue')
plt.title('Residual Plot: Actual vs Residuals')
plt.xlabel('Actual Profit')
plt.ylabel('Residuals (Actual - Predicted)')
plt.axhline(0, color='red', linestyle='--', linewidth=1)
plt.grid(True)
plt.show()
In [ ]: import tensorflow as tf
# Define the neural network model
neural_network = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape=(X_train_scaled.shape[1],), activation='relu'),
    tf.keras.layers.BatchNormalization(),  # Apply batch normalization after the first dense layer
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),  # Apply batch normalization after the second dense layer
    tf.keras.layers.Dense(1)  # Output layer for predicting a continuous value
])
# Compile the model with the Adam optimizer and mean squared error loss
neural_network.compile(optimizer='adam', loss='mean_squared_error')
# Fit the model to the training data with validation split
training_history = neural_network.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)
# Predict values on the test set
predicted_values_nn = neural_network.predict(X_test_scaled)
# Calculate and print the Mean Squared Error for the model
mean_squared_error_nn = mean_squared_error(y_test, predicted_values_nn)
print(f"Mean Squared Error (NN): {mean_squared_error_nn}")
/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Epoch 1/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 3s 4ms/step - loss: 47668.7617 - val_loss: 51688.6797
Epoch 2/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 64383.8750 - val_loss: 46871.4219
Epoch 3/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 33989.1836 - val_loss: 41950.8047
Epoch 4/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 40321.8438 - val_loss: 37491.6797
Epoch 5/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 35579.7109 - val_loss: 31717.7617
Epoch 6/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 29560.2715 - val_loss: 28373.3945
Epoch 7/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 34109.1094 - val_loss: 25534.1855
Epoch 8/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 51172.4766 - val_loss: 20007.9590
Epoch 9/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 35456.3477 - val_loss: 17684.2188
Epoch 10/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 27014.1738 - val_loss: 16791.4980
Epoch 11/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 25342.3340 - val_loss: 15307.5625
Epoch 12/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 19282.0391 - val_loss: 13753.0234
Epoch 13/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 32910.4805 - val_loss: 9830.8936
Epoch 14/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 28342.3164 - val_loss: 8664.4775
Epoch 15/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 21314.8379 - val_loss: 7692.1582
Epoch 16/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 29441.5254 - val_loss: 5153.7827
Epoch 17/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 20392.1309 - val_loss: 5785.3110
Epoch 18/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 30496.3496 - val_loss: 3993.5117
Epoch 19/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 27584.6875 - val_loss: 3594.0852
Epoch 20/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 20475.7793 - val_loss: 3755.9656
Epoch 21/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 61953.0586 - val_loss: 1671.9246
Epoch 22/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 26770.6309 - val_loss: 2593.4043
Epoch 23/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 10425.8662 - val_loss: 2277.5725
Epoch 24/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 19270.7734 - val_loss: 1250.8818
Epoch 25/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 14116.5947 - val_loss: 1505.1302
Epoch 26/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 14105.9600 - val_loss: 1167.1968
Epoch 27/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 25164.5859 - val_loss: 2049.8232
Epoch 28/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 26295.7676 - val_loss: 2879.0378
Epoch 29/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 28493.4883 - val_loss: 1048.2556
Epoch 30/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 17539.0742 - val_loss: 4008.9102
Epoch 31/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 8479.7100 - val_loss: 1161.5667
Epoch 32/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 15669.1748 - val_loss: 1503.3073
Epoch 33/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 28940.2637 - val_loss: 1834.2437
Epoch 34/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 13616.1494 - val_loss: 1704.3368
Epoch 35/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 5860.9961 - val_loss: 2200.2449
Epoch 36/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 16617.9785 - val_loss: 4047.9802
Epoch 37/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 32050.4023 - val_loss: 2532.6729
Epoch 38/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 16179.7148 - val_loss: 4513.4199
Epoch 39/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 21568.2441 - val_loss: 5153.5933
Epoch 40/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 24783.0625 - val_loss: 5456.6978
Epoch 41/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 15170.2783 - val_loss: 1733.7618
Epoch 42/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 32876.7031 - val_loss: 1131.5521
Epoch 43/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 14524.8955 - val_loss: 1701.3094
Epoch 44/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 9640.7227 - val_loss: 4312.9932
Epoch 45/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 15651.7090 - val_loss: 2375.5364
Epoch 46/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 14777.8525 - val_loss: 3953.7329
Epoch 47/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 5551.8149 - val_loss: 4933.4780
Epoch 48/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 6877.3472 - val_loss: 2246.7551
Epoch 49/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 14761.7148 - val_loss: 8128.3403
Epoch 50/50
162/162 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 13291.8027 - val_loss: 6345.6904
51/51 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
Mean Squared Error: 6065.888140869391
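The validation loss above fluctuates considerably from epoch to epoch, so the weights at epoch 50 are not necessarily the best ones observed. One optional refinement (a sketch, not used in this run) is an EarlyStopping callback that halts training once validation loss stops improving and restores the best weights:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                              restore_best_weights=True)
# Same fit call as above, with the callback attached
neural_network.fit(X_train_scaled, y_train, epochs=50, batch_size=32,
                   validation_split=0.2, callbacks=[early_stop])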
In [ ]: # Calculate R-squared for the model predictions
r_squared_nn = r2_score(y_test, predicted_values_nn)
print(f"R-squared (NN): {r_squared_nn}")
R-squared: 0.7867673897551717
In [ ]: import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
In [ ]: # Convert NumPy arrays to PyTorch tensors with appropriate data types
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # Reshape targets to a column vector
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)
# Create datasets and dataloaders for training and testing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
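To confirm the loaders yield tensors of the shape the model expects, a single batch can be inspected (illustrative sketch):
# Each training batch should be (32, num_features) inputs and (32, 1) targets
sample_inputs, sample_targets = next(iter(train_loader))
print(sample_inputs.shape, sample_targets.shape)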
In [ ]: class FeedForwardNN(nn.Module):
    def __init__(self):
        super(FeedForwardNN, self).__init__()
        # Define the network layers
        self.fc1 = nn.Linear(X_train_tensor.shape[1], 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc_out = nn.Linear(64, 1)

    def forward(self, x):
        # Forward pass through the network
        x = torch.relu(self.bn1(self.fc1(x)))
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.fc_out(x)
        return x
# Create an instance of the model
model_instance = FeedForwardNN()
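As a quick sanity check on the architecture, the trainable parameter count can be printed (a sketch; the exact total depends on the number of one-hot encoded features):
# Total trainable parameters across the three linear layers and two batch norms
num_params = sum(p.numel() for p in model_instance.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params}")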
In [ ]: # Set up the loss function and optimizer
loss_function = nn.MSELoss() # Mean Squared Error for regression tasks
optimizer = optim.Adam(model_instance.parameters(), lr=0.001)
In [ ]: # Training the neural network model
num_epochs = 50
for epoch in range(num_epochs):
    model_instance.train()  # Set model to training mode
    total_loss = 0.0
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()                             # Reset gradients
        predictions = model_instance(batch_inputs)        # Perform forward pass
        loss = loss_function(predictions, batch_targets)  # Calculate loss
        loss.backward()                                   # Compute gradients
        optimizer.step()                                  # Update model parameters
        total_loss += loss.item()
    # Print the average loss for the epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}')
Epoch 1/50, Loss: 51424.657269770556
Epoch 2/50, Loss: 48310.33356105691
Epoch 3/50, Loss: 44932.61543251264
Epoch 4/50, Loss: 41313.72389772623
Epoch 5/50, Loss: 39324.14825605638
Epoch 6/50, Loss: 36726.14981592764
Epoch 7/50, Loss: 35337.64955350668
Epoch 8/50, Loss: 34768.07926850272
Epoch 9/50, Loss: 33027.254523400035
Epoch 10/50, Loss: 31722.777753622224
Epoch 11/50, Loss: 30838.23672727075
Epoch 12/50, Loss: 30333.480208746278
Epoch 13/50, Loss: 29343.359767498358
Epoch 14/50, Loss: 29310.228390079912
Epoch 15/50, Loss: 28139.486955699354
Epoch 16/50, Loss: 27291.915266886797
Epoch 17/50, Loss: 26410.431213529984
Epoch 18/50, Loss: 26361.390899658203
Epoch 19/50, Loss: 26027.35403684106
Epoch 20/50, Loss: 25200.32362018245
Epoch 21/50, Loss: 25254.22382528475
Epoch 22/50, Loss: 24776.36201099358
Epoch 23/50, Loss: 25340.636858647413
Epoch 24/50, Loss: 23679.249445395893
Epoch 25/50, Loss: 24117.204031160563
Epoch 26/50, Loss: 23072.884546261022
Epoch 27/50, Loss: 22699.725368084295
Epoch 28/50, Loss: 22319.26638869484
Epoch 29/50, Loss: 21812.148398522102
Epoch 30/50, Loss: 21496.75541520827
Epoch 31/50, Loss: 21163.72158677507
Epoch 32/50, Loss: 21015.17931615244
Epoch 33/50, Loss: 20904.659133457902
Epoch 34/50, Loss: 20409.528865398748
Epoch 35/50, Loss: 20188.139873542408
Epoch 36/50, Loss: 19548.72394433352
Epoch 37/50, Loss: 18877.12769566904
Epoch 38/50, Loss: 18564.116300073
Epoch 39/50, Loss: 17895.596304260856
Epoch 40/50, Loss: 18856.125500669576
Epoch 41/50, Loss: 18531.74821426845
Epoch 42/50, Loss: 17332.796170376314
Epoch 43/50, Loss: 17418.803755354173
Epoch 44/50, Loss: 17068.957090774384
Epoch 45/50, Loss: 16144.931223501073
Epoch 46/50, Loss: 15548.156839956151
Epoch 47/50, Loss: 15317.066974904277
Epoch 48/50, Loss: 15231.355718631556
Epoch 49/50, Loss: 14943.487428721815
Epoch 50/50, Loss: 14945.049020597251
In [ ]: # Set the model to evaluation mode
model_instance.eval()
predicted_values_test = []
# Disable gradient computation for evaluation
with torch.no_grad():
    for batch_inputs, _ in test_loader:
        predictions = model_instance(batch_inputs)
        predicted_values_test.append(predictions)
# Concatenate the predictions and convert them to numpy array
predicted_values_test = torch.cat(predicted_values_test).numpy()
# Compute and display the evaluation metrics
test_mse = mean_squared_error(y_test, predicted_values_test)
print(f'Mean Squared Error on Test Set: {test_mse}')
test_r2 = r2_score(y_test, predicted_values_test)
print(f"R-squared on Test Data: {test_r2:.4f}")
Mean Squared Error on Test Set: 2675.018969037517
R-squared on Test Set: 0.9059657441786345
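Across the three models, test-set performance improves steadily: linear regression explains about 64.5% of the variance in profit (MSE ≈ 10087), the Keras network about 78.7% (MSE ≈ 6066), and the PyTorch network about 90.6% (MSE ≈ 2675), suggesting that the nonlinear models capture interactions the linear baseline cannot.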