
Data Visualization for Product Design Laboratory

(CSDC – 023)
3rd Semester Computer Science and Engineering
Prepared during (Jun-Dec 2024) by
Name: Ravi Nautiyal
Roll No.: 23103117
Submitted to: Mr. Vishal Kumar Verma

Department of Computer Science and Engineering


DR. B.R. AMBEDKAR NATIONAL INSTITUTE
OF TECHNOLOGY, JALANDHAR, PUNJAB - 144011
S. No. | Week    | Name of Practical                                                           | Submission Date | Signature
1      | WEEK 1  | Assignment 01a - Introduction to data visualization using Excel            | 31.07.2024      |
       |         | Assignment 01b - Introduction to data visualization using Excel            | 10.08.2024      |
2      | WEEK 2  | Assignment 02 - Data visualization in Python using Experiment 1 techniques | 17.08.2024      |
3      | WEEK 3  | Lab 03 - R installation                                                     | 18.08.2024      |
4      | WEEK 4  | Lab 03 - R libraries installation                                           | 29.08.2024      |
5      | WEEK 5  | Data visualization using R                                                  | 04.09.2024      |
6      | WEEK 6  | Data preprocessing (binning)                                                | 10.09.2024      |
7      | WEEK 7  | Data preprocessing using Python and R                                       | 17.09.2024      |
8      | WEEK 8  | Installation of MySQL Workbench and creation of basic schema               | 22.09.2024      |
9      | WEEK 9  | ER diagram                                                                  | 25.09.2024      |
       |         | Concepts of keys in MySQL                                                   | 17.10.2024      |
10     | WEEK 10 | PCA assignment                                                              | 09.11.2024      |
11     | WEEK 11 | PCA 02 assignment                                                           | 09.11.2024      |
12     | WEEK 12 | Implementation of linear regression                                         | 19.11.2024      |

Assignment 01 a
(Introduction to data visualization using Excel)
Assignment 01 b
Introduction to data visualization using Excel
Assignment 02
Data Visualization in Python Using Experiment 1 Techniques

Bar chart

import pandas as pd
import matplotlib.pyplot as plt

# reading the database
data = pd.read_csv("tips.csv")

# Bar chart with day against tip
plt.bar(data['day'], data['tip'])
plt.title("Bar Chart")

# Setting the X and Y labels
plt.xlabel('Day')
plt.ylabel('Tip')

plt.show()

Heat Map

import plotly.express as px

data = [[1, 25, 30, 50, 1],
        [20, 1, 60, 80, 30],
        [30, 60, 1, 5, 20]]

fig = px.imshow(data,
                labels=dict(x="Day of Week", y="Time of Day", color="Productivity"),
                x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
                y=['Morning', 'Afternoon', 'Evening'])
fig.update_xaxes(side="top")
fig.show()

Line Chart

# importing packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# reading the database
data = pd.read_csv("tips.csv")

sns.lineplot(x='day', y='tip', data=data)
plt.show()

Histogram

import pandas as pd
import matplotlib.pyplot as plt

# reading the database
data = pd.read_csv("tips.csv")

# histogram of total_bills
plt.hist(data['total_bill'])
plt.title("Histogram")
plt.xlabel('Total Bill')
plt.ylabel('Frequency')
plt.show()

Scatter Plot

import pandas as pd
import matplotlib.pyplot as plt

# reading the database
data = pd.read_csv("tips.csv")

# Scatter plot with day against tip
plt.scatter(data['day'], data['tip'], c=data['size'], s=data['total_bill'])

# Adding Title to the Plot
plt.title("Scatter Plot")

# Setting the X and Y labels
plt.xlabel('Day')
plt.ylabel('Tip')

plt.colorbar()
plt.show()

Sankey Diagram

import plotly.graph_objects as go

# Example dataset
source = [0, 0, 1, 1, 2, 2]   # A, A, B, B, C, C
target = [3, 4, 3, 5, 4, 5]   # X, Y, X, Z, Y, Z
value = [10, 20, 15, 5, 25, 30]

# Labels for nodes
labels = ['A', 'B', 'C', 'X', 'Y', 'Z']

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
    ),
    link=dict(source=source, target=target, value=value)
))

fig.update_layout(title_text="Sankey Diagram Example", font_size=10)
fig.show()
Area Plot

import pandas as pd
import matplotlib.pyplot as plt

# Sample Data
url = "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv"
df = pd.read_csv(url)

# Filter data for a specific country and year range (e.g., USA)
df_usa = df[(df['country'] == 'United States') & (df['year'] > 2000)]

# Plotting Area Plot
plt.fill_between(df_usa['year'], df_usa['primary_energy_consumption'], color="skyblue", alpha=0.4)
plt.title('Energy Consumption Over Time in the USA')
plt.xlabel('Year')
plt.ylabel('Energy Consumption (TWh)')
plt.show()
Waffle Chart

import pandas as pd
import matplotlib.pyplot as plt

# Example dataset (simplified since pywaffle isn't available)
data = {'division': ['East', 'West', 'North', 'South'],
        'count': [50, 30, 10, 10]}

# Convert data into a DataFrame
df = pd.DataFrame(data)

# Create a Waffle chart manually with Matplotlib
fig, ax = plt.subplots(figsize=(6, 6))

total_values = df['count'].sum()
category_percentages = df['count'] / total_values
category_sizes = (category_percentages * 100).round().astype(int)

# Creating the Waffle chart grid (10 x 10 = 100 cells, one per percentage point)
n_rows = 10
n_cols = 10
grid = [0] * (n_rows * n_cols)

# Assign values to the grid: each category gets a block of cells proportional to its share
start = 0
for idx, size in enumerate(category_sizes):
    grid[start:start + size] = [idx] * size
    start += size

# Plotting the Waffle chart
colors = plt.cm.tab10.colors
for i in range(n_rows):
    for j in range(n_cols):
        ax.add_patch(plt.Rectangle((j, i), 1, 1, color=colors[grid[i * n_cols + j]]))

# Add legend
labels = list(df['division'])
handles = [plt.Rectangle((0, 0), 1, 1, color=colors[k]) for k in range(len(labels))]
ax.legend(handles, labels, loc='upper right')

ax.set_xlim(0, n_cols)
ax.set_ylim(0, n_rows)
ax.set_aspect('equal')
ax.axis('off')
plt.show()

Waterfall Chart

import matplotlib.pyplot as plt
import pandas as pd

# Sample data
data = pd.DataFrame({
    'categories': ['Start', 'Revenue', 'Cost', 'Profit', 'End'],
    'values': [0, 100, -30, 70, 0]
})

# Calculate the cumulative values
data['cumulative'] = data['values'].cumsum()
data['start'] = data['cumulative'].shift(1).fillna(0)
data['end'] = data['cumulative']

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))

# Draw one floating bar per category, spanning from its start to its end value
ax.bar(range(len(data)), data['end'] - data['start'], bottom=data['start'])

# Add labels
ax.set_xticks(range(len(data['categories'])))
ax.set_xticklabels(data['categories'])
ax.set_xlabel('Categories')
ax.set_ylabel('Values')
ax.set_title('Waterfall Chart')
plt.grid(True)
plt.tight_layout()
plt.savefig('waterfall_chart.png')
plt.show()

Assignment 03 (R and R libraries installation)


Package installation of esquisse
Package installation of plotly

Package installation of ggplot2


Package installation of lattice

Package installation of rgl


Package installation of dygraphs

Package installation of leaflet


Package installation of rayrender

Package installation of highcharter


Package installation of ggvis

Package installation of tidyr


Data visualization using R
Control Statements
Histogram
Boxplot: used for summarizing the data and for showing the median and the extreme points.

Scatterplot: used to show the relationship between two variables using dots.

Heatmap: used to display a visual representation of the data using colors (a short Python sketch of the boxplot and heatmap appears after this list).
3D: R also gives us the option to display 3D charts.
Line Chart

Bar Chart:
Pie Chart

Histogram
Bubble Chart
Waterfall Chart

Sankey Diagram
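
A minimal Python sketch of two of the chart types described above (the boxplot and the heatmap), using matplotlib and small made-up arrays; the data values are illustrative only:

import numpy as np
import matplotlib.pyplot as plt

# Made-up sample data (illustrative only)
rng = np.random.default_rng(0)
scores = [rng.normal(loc=m, scale=5, size=50) for m in (60, 70, 80)]
grid = rng.random((4, 6))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Boxplot: summarizes each group by its median, quartiles and extreme points
ax1.boxplot(scores, labels=['A', 'B', 'C'])
ax1.set_title('Boxplot')

# Heatmap: colors encode the magnitude of each cell
im = ax2.imshow(grid, cmap='viridis')
fig.colorbar(im, ax=ax2)
ax2.set_title('Heatmap')

plt.tight_layout()
plt.show()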

> library(plotly)
Loading required package: ggplot2

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout

> library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

> # Fetch real-time COVID-19 vaccination data
> url <- "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv"
> vacc_data <- read.csv(url)
>
> # Prepare data for Sankey Diagram
> vacc_summary <- vacc_data %>%
+   filter(location %in% c("United States", "India", "Brazil", "Russia", "United Kingdom")) %>%
+   group_by(location) %>%
+   summarize(total_vaccinations = max(total_vaccinations, na.rm = TRUE)) %>%
+   mutate(target = "World")
>
> nodes <- data.frame(name = c(unique(vacc_summary$location), "World"))
>
> links <- data.frame(
+   source = match(vacc_summary$location, nodes$name) - 1,
+   target = match(vacc_summary$target, nodes$name) - 1,
+   value = vacc_summary$total_vaccinations
+ )
>
> # Sankey Diagram with Plotly
> p <- plot_ly(
+   type = "sankey",
+   node = list(label = nodes$name),
+   link = list(source = links$source, target = links$target, value = links$value)
+ )
>
3D scatter Plot
Leaflet Map Visualization
Bubble Plot
Library Ggvis
Use of Library Lattice
Data Preprocessing
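
A minimal sketch of the binning step covered in this lab, assuming the tips.csv data used in the earlier Python assignments; the bin count and labels are chosen for illustration only:

import pandas as pd

# reading the database used in the earlier examples
data = pd.read_csv("tips.csv")

# Equal-width binning of total_bill into three illustrative categories
data['bill_bin'] = pd.cut(data['total_bill'], bins=3, labels=['low', 'medium', 'high'])

# Equal-frequency (quantile) binning as an alternative
data['bill_qbin'] = pd.qcut(data['total_bill'], q=4)

print(data[['total_bill', 'bill_bin', 'bill_qbin']].head())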
Preprocessing with R and Python
Installation of MySQL workbench and creation of basic schema
Submission of ER diagram
Concepts of Keys in MySQL
PCA assignment
For Dataset of Heart.csv
1. Find the number of different items or tuples per class.

2. Split the whole data and see if there are any obvious patterns (correlations between

different variables). Use scatter plot to see the distribution of data.

import csv
import math
import matplotlib.pyplot as plt

# Load the CSV file manually
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            # Ensure we only extract numeric values, converting where possible
            try:
                data.append([float(x) for x in row])
            except ValueError:
                print(f"Skipping row with non-numeric values: {row}")
    return data

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)

# Compute the standard deviation of a list
def std_dev(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)
    return math.sqrt(variance)

# Compute the correlation between two lists
def correlation(x, y):
    mu_x = mean(x)
    mu_y = mean(y)
    std_x = std_dev(x)
    std_y = std_dev(y)
    cov = sum((x[i] - mu_x) * (y[i] - mu_y) for i in range(len(x))) / len(x)
    return cov / (std_x * std_y)

# Load the dataset from a specific path
# Replace the path with your correct path to the heart.csv file
data = load_csv(r'C:\Users\Ravi Nautiyal\OneDrive\Desktop\DVPDLABWORK\PCA\heart.csv')

# Extract individual columns (Adjust indices based on actual dataset structure)
age = [row[0] for row in data]      # Assuming age is in the 1st column
sex = [row[1] for row in data]      # Assuming sex is in the 2nd column
trtbps = [row[3] for row in data]   # Assuming trtbps (blood pressure) is in the 4th column
chol = [row[4] for row in data]     # Assuming cholesterol is in the 5th column
thalach = [row[7] for row in data]  # Assuming max heart rate (thalach) is in the 8th column
target = [row[-1] for row in data]  # Assuming target is the last column

# Manually compute the correlation matrix between selected variables
corr_matrix = [
    [correlation(age, age), correlation(age, sex), correlation(age, trtbps), correlation(age, chol), correlation(age, thalach)],
    [correlation(sex, age), correlation(sex, sex), correlation(sex, trtbps), correlation(sex, chol), correlation(sex, thalach)],
    [correlation(trtbps, age), correlation(trtbps, sex), correlation(trtbps, trtbps), correlation(trtbps, chol), correlation(trtbps, thalach)],
    [correlation(chol, age), correlation(chol, sex), correlation(chol, trtbps), correlation(chol, chol), correlation(chol, thalach)],
    [correlation(thalach, age), correlation(thalach, sex), correlation(thalach, trtbps), correlation(thalach, chol), correlation(thalach, thalach)]]

# Print the computed correlation matrix
print("Correlation Matrix:")
for row in corr_matrix:
    print(row)

# Scatter plot function
def scatter_plot(x, y, x_label, y_label):
    plt.scatter(x, y, alpha=0.5)
    plt.title(f'Scatter plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Create scatter plots between various variables
scatter_plot(age, trtbps, 'Age', 'Resting Blood Pressure')
scatter_plot(age, chol, 'Age', 'Cholesterol')
scatter_plot(chol, thalach, 'Cholesterol', 'Max Heart Rate Achieved')

3. Let’s shrink the whole dataset down to 2 dimensions (with as little loss as possible) with

the help of PCA.

Note that while computing the principal components, you have to find the Covariance
Matrix, unit eigenvectors and print them as well.

import csv
import numpy as np

# Load the CSV file manually
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            try:
                # Try converting each element to a float, and append if successful
                data.append([float(x) for x in row])
            except ValueError:
                # Skip rows with non-numeric values
                print(f"Skipping row with non-numeric values: {row}")
    return data

# Standardize the data (mean = 0, standard deviation = 1)
def standardize_data(data):
    data = np.array(data)
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    standardized_data = (data - means) / std_devs
    return standardized_data

# Compute the covariance matrix
def compute_covariance_matrix(data):
    return np.cov(data.T)

# Compute eigenvalues and eigenvectors
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Project the data onto the top k eigenvectors
def project_data(data, eigenvectors, k):
    return np.dot(data, eigenvectors[:, :k])

# Load the dataset (Replace with your actual CSV path)
data = load_csv(r'C:\Users\Ravi Nautiyal\OneDrive\Desktop\DVPDLABWORK\PCA\heart.csv')

# Standardize the data (remove the target column if included)
data = standardize_data(data)

# Step 1: Compute the covariance matrix
cov_matrix = compute_covariance_matrix(data)
print("Covariance Matrix:")
print(cov_matrix)

# Step 2: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = compute_eigen(cov_matrix)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors (Principal Components):")
print(eigenvectors)

# Step 3: Sort eigenvalues and select the top 2
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]
print("\nSorted Eigenvalues (Variance explained by each component):")
print(eigenvalues)

# Step 4: Reduce to 2 principal components
projected_data = project_data(data, eigenvectors, 2)

# Print the transformed 2D data
print("\nProjected Data onto 2 Principal Components (PCA result):")
print(projected_data)

Output:

Covariance Matrix:
(14 x 14 matrix of pairwise covariances printed here)

Eigenvalues:
[3.31240732 1.57737062 1.23610856 1.2106344  0.3691578  0.37310751
 0.43145758 1.02541325 0.97338089 0.53617184 0.86609311 0.77866275
 0.73160926 0.62478273]

Eigenvectors (Principal Components):
(14 x 14 matrix of unit eigenvectors printed here)

Sorted Eigenvalues (Variance explained by each component):
[3.31240732 1.57737062 1.23610856 1.2106344  1.02541325 0.97338089
 0.86609311 0.77866275 0.73160926 0.62478273 0.53617184 0.43145758
 0.37310751 0.3691578 ]

Projected Data onto 2 Principal Components (PCA result):
[[-5.17391076e-02  2.62402204e+00]
 [-8.17440594e-01 -7.30374576e-01]
 [-2.05759932e+00 -3.90984463e-02]
 [-1.90304318e+00 -5.96700877e-01]
 [-7.68370620e-01  4.12545496e-01]
 [-1.00063986e+00 -2.33692414e-01]
 [-5.91350471e-01  1.63794959e+00]
 [-2.00195499e+00 -1.32964375e+00]
 [-1.23612315e+00  1.06805571e+00]
 [-1.69086932e+00  1.84121933e-02]
 [-1.10288020e+00 -4.16413123e-01]
 [-2.06867462e+00  6.26420387e-01]
 ...

import matplotlib.pyplot as plt

# Plot the PCA result (first 2 principal components)
plt.scatter(projected_data[:, 0], projected_data[:, 1], alpha=0.5)
plt.title('PCA Projection (2 Dimensions)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()
4. Plot the whole input points as well as the corresponding computed principal components in a figure. [Use matplotlib and seaborn to showcase the curves]

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import csv

# Load the CSV file manually
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            try:
                # Try converting each element to a float, and append if successful
                data.append([float(x) for x in row])
            except ValueError:
                print(f"Skipping row with non-numeric values: {row}")
    return data

# Standardize the data (mean = 0, standard deviation = 1)
def standardize_data(data):
    data = np.array(data)
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    standardized_data = (data - means) / std_devs
    return standardized_data

# Compute the covariance matrix
def compute_covariance_matrix(data):
    return np.cov(data.T)

# Compute eigenvalues and eigenvectors
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Project the data onto the top k eigenvectors
def project_data(data, eigenvectors, k):
    return np.dot(data, eigenvectors[:, :k])

# Load the dataset (Replace with your actual CSV path)
data = load_csv(r'C:\Users\Ravi Nautiyal\OneDrive\Desktop\DVPDLABWORK\PCA\heart.csv')

# Standardize the data (remove the target column if included)
data = standardize_data(data)

# Step 1: Compute the covariance matrix
cov_matrix = compute_covariance_matrix(data)

# Step 2: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = compute_eigen(cov_matrix)

# Step 3: Sort eigenvalues and select the top 2
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Step 4: Reduce to 2 principal components
projected_data = project_data(data, eigenvectors, 2)

# Convert projected data into a format that can be plotted
projected_data_2D = np.array(projected_data)

# Visualizing the original data (first 2 dimensions of original dataset)
plt.figure(figsize=(12, 6))

# Plot the original input points (using first two features from original data)
plt.subplot(1, 2, 1)
plt.scatter(data[:, 0], data[:, 1], alpha=0.5, color='blue')
plt.title('Original Data (First 2 Dimensions)')
plt.xlabel('Feature 1 (First Dimension)')
plt.ylabel('Feature 2 (Second Dimension)')
plt.grid(True)

# Visualize the PCA result (projected data onto 2 principal components)
plt.subplot(1, 2, 2)
plt.scatter(projected_data_2D[:, 0], projected_data_2D[:, 1], alpha=0.5, color='green')
plt.title('PCA Projection (2 Dimensions)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)

plt.tight_layout()
plt.show()

For dataset of Movies List

1. Find the number of different items or tuples per class.

import pandas as pd

# Load the dataset


df = pd.read_csv('movielist.csv')

# Assuming there is a class/target column that indicates classes

# Replace 'class_column' with the actual name of the class column

class_counts = df['Genre'].value_counts()

print("Number of items per class:")

print(class_counts)

Output:

2. Split the whole data and see if there are any obvious patterns (correlations between

different variables). Use scatter plot to see the distribution of data

import csv
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the CSV file and filter only numeric columns
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Skip the header
        for row in reader:
            numeric_row = []
            for x in row:
                try:
                    # Try converting each element to a float, and append if successful
                    numeric_row.append(float(x))
                except ValueError:
                    # Skip non-numeric values
                    continue
            if len(numeric_row) > 0:  # Only append if we found numeric values
                data.append(numeric_row)
    # Find the maximum number of columns (some rows may have missing values)
    max_columns = max(len(row) for row in data)
    # Filter out rows with missing numeric values (rows that don't match the max_columns length)
    data = [row for row in data if len(row) == max_columns]
    return np.array(data)

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)

# Compute the standard deviation of a list
def std_dev(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)
    return math.sqrt(variance)

# Compute the correlation between two lists
def correlation(x, y):
    mu_x = mean(x)
    mu_y = mean(y)
    std_x = std_dev(x)
    std_y = std_dev(y)
    cov = sum((x[i] - mu_x) * (y[i] - mu_y) for i in range(len(x))) / len(x)
    return cov / (std_x * std_y)

# Load the dataset (numeric columns only)
data = load_csv('movielist.csv')

if data.size == 0:
    print("No numeric data available for analysis.")
else:
    # Compute the correlation matrix manually
    num_columns = data.shape[1]
    corr_matrix = [[correlation(data[:, i], data[:, j]) for j in range(num_columns)] for i in range(num_columns)]
    print("Correlation Matrix:")
    for row in corr_matrix:
        print(row)

# Scatter plot function
def scatter_plot(x, y, x_label, y_label):
    plt.scatter(x, y, alpha=0.5)
    plt.title(f'Scatter plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Scatter plot between two numeric variables (first two numeric columns)
scatter_plot(data[:, 0], data[:, 1], 'Numeric Feature 1', 'Numeric Feature 2')

Output:
3. Let’s shrink the whole dataset down to 2 dimensions (with as little loss as possible) with

the help of PCA.

Note that while computing the principal components, you have to find the Covariance

Matrix, unit eigenvectors and print them as well.

import csv
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the CSV file and filter only numeric columns
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Skip the header
        for row in reader:
            numeric_row = []
            for x in row:
                try:
                    # Try converting each element to a float, and append if successful
                    numeric_row.append(float(x))
                except ValueError:
                    # Skip non-numeric values
                    continue
            if len(numeric_row) > 0:  # Only append if we found numeric values
                data.append(numeric_row)
    # Find the maximum number of columns (some rows may have missing values)
    max_columns = max(len(row) for row in data)
    # Filter out rows with missing numeric values (rows that don't match the max_columns length)
    data = [row for row in data if len(row) == max_columns]
    return np.array(data)

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)

# Compute the standard deviation of a list
def std_dev(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)
    return math.sqrt(variance)

# Compute the correlation between two lists
def correlation(x, y):
    mu_x = mean(x)
    mu_y = mean(y)
    std_x = std_dev(x)
    std_y = std_dev(y)
    cov = sum((x[i] - mu_x) * (y[i] - mu_y) for i in range(len(x))) / len(x)
    return cov / (std_x * std_y)

# Load the dataset (numeric columns only)
data = load_csv('movielist.csv')

if data.size == 0:
    print("No numeric data available for analysis.")
else:
    # Compute the correlation matrix manually
    num_columns = data.shape[1]
    corr_matrix = [[correlation(data[:, i], data[:, j]) for j in range(num_columns)] for i in range(num_columns)]
    print("Correlation Matrix:")
    for row in corr_matrix:
        print(row)

# Scatter plot function
def scatter_plot(x, y, x_label, y_label):
    plt.scatter(x, y, alpha=0.5)
    plt.title(f'Scatter plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Scatter plot between two numeric variables (first two numeric columns)
scatter_plot(data[:, 0], data[:, 1], 'Numeric Feature 1', 'Numeric Feature 2')

# Standardize the data manually (mean = 0, variance = 1)
def standardize_data(data):
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    standardized_data = (data - means) / std_devs
    return standardized_data

# Compute the covariance matrix manually
def compute_covariance_matrix(data):
    return np.cov(data.T)

# Compute eigenvalues and eigenvectors manually
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Project the data onto the top k eigenvectors manually
def project_data(data, eigenvectors, k):
    return np.dot(data, eigenvectors[:, :k])

# Standardize the data
data_standardized = standardize_data(data)

# Step 1: Compute the covariance matrix
cov_matrix = compute_covariance_matrix(data_standardized)
print("Covariance Matrix:")
print(cov_matrix)

# Step 2: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = compute_eigen(cov_matrix)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors (Principal Components):")
print(eigenvectors)

# Step 3: Sort eigenvalues and select the top 2
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]
print("\nSorted Eigenvalues (Variance explained by each component):")
print(eigenvalues)

# Step 4: Project the data onto the top 2 eigenvectors
projected_data = project_data(data_standardized, eigenvectors, 2)
print("\nProjected Data onto 2 Principal Components (PCA result):")
print(projected_data)

Output:
4. Plot the whole input points as well as the corresponding computed principal components in a figure. [Use matplotlib and seaborn to showcase the curves]

import csv
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the CSV file and filter only numeric columns
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Skip the header
        for row in reader:
            numeric_row = []
            for x in row:
                try:
                    # Try converting each element to a float, and append if successful
                    numeric_row.append(float(x))
                except ValueError:
                    # Skip non-numeric values
                    continue
            if len(numeric_row) > 0:  # Only append if we found numeric values
                data.append(numeric_row)
    # Find the maximum number of columns (some rows may have missing values)
    max_columns = max(len(row) for row in data)
    # Filter out rows with missing numeric values (rows that don't match the max_columns length)
    data = [row for row in data if len(row) == max_columns]
    return np.array(data)

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)

# Compute the standard deviation of a list
def std_dev(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)
    return math.sqrt(variance)

# Compute the correlation between two lists
def correlation(x, y):
    mu_x = mean(x)
    mu_y = mean(y)
    std_x = std_dev(x)
    std_y = std_dev(y)
    cov = sum((x[i] - mu_x) * (y[i] - mu_y) for i in range(len(x))) / len(x)
    return cov / (std_x * std_y)

# Load the dataset (numeric columns only)
data = load_csv('movielist.csv')

if data.size == 0:
    print("No numeric data available for analysis.")
else:
    # Compute the correlation matrix manually
    num_columns = data.shape[1]
    corr_matrix = [[correlation(data[:, i], data[:, j]) for j in range(num_columns)] for i in range(num_columns)]
    print("Correlation Matrix:")
    for row in corr_matrix:
        print(row)

# Scatter plot function
def scatter_plot(x, y, x_label, y_label):
    plt.scatter(x, y, alpha=0.5)
    plt.title(f'Scatter plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Scatter plot between two numeric variables (first two numeric columns)
scatter_plot(data[:, 0], data[:, 1], 'Numeric Feature 1', 'Numeric Feature 2')

# Standardize the data manually (mean = 0, variance = 1)
def standardize_data(data):
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    standardized_data = (data - means) / std_devs
    return standardized_data

# Compute the covariance matrix manually
def compute_covariance_matrix(data):
    return np.cov(data.T)

# Compute eigenvalues and eigenvectors manually
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Project the data onto the top k eigenvectors manually
def project_data(data, eigenvectors, k):
    return np.dot(data, eigenvectors[:, :k])

# Standardize the data
data_standardized = standardize_data(data)

# Step 1: Compute the covariance matrix
cov_matrix = compute_covariance_matrix(data_standardized)
print("Covariance Matrix:")
print(cov_matrix)

# Step 2: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = compute_eigen(cov_matrix)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors (Principal Components):")
print(eigenvectors)

# Step 3: Sort eigenvalues and select the top 2
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]
print("\nSorted Eigenvalues (Variance explained by each component):")
print(eigenvalues)

# Step 4: Project the data onto the top 2 eigenvectors
projected_data = project_data(data_standardized, eigenvectors, 2)
print("\nProjected Data onto 2 Principal Components (PCA result):")
print(projected_data)

# Plot original data (first 2 dimensions)
plt.figure(figsize=(12, 6))

# Original data plot (first 2 features)
plt.subplot(1, 2, 1)
plt.scatter(data_standardized[:, 0], data_standardized[:, 1], alpha=0.5, color='blue')
plt.title('Original Data (First 2 Dimensions)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True)

# PCA transformed data (first 2 principal components)
plt.subplot(1, 2, 2)
plt.scatter(projected_data[:, 0], projected_data[:, 1], alpha=0.5, color='green')
plt.title('PCA: Data Projected onto 2 Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)

plt.tight_layout()
plt.show()

Output:
PCA 2
PCA on the Youth Smoking and Drug Dataset
1. Load the Youth Smoking and Drug Dataset, which contains various features related to youth smoking habits, substance use, and demographic information. Display the first few rows of the dataset and summarize its key characteristics.

#1

import pandas as pd

# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(youth_smoking_data.head())

# Summary of the dataset
print("\nSummary of the dataset:")
print(f"Shape of the dataset: {youth_smoking_data.shape}")  # Number of rows and columns

# Get data types of the columns
print("\nData types of the columns:")
print(youth_smoking_data.dtypes)

# Descriptive statistics for numerical columns
print("\nDescriptive statistics:")
print(youth_smoking_data.describe())

# Check for missing values
print("\nMissing values in each column:")
print(youth_smoking_data.isnull().sum())

Output:

First few rows of the dataset:
   Year Age_Group  Gender ... Substance_Education Community_Support Media_Influence
0  2024     15-19    Both ...                  No                 3               1
1  2024     10-14  Female ...                 Yes                 9               3
2  2023     10-14    Both ...                  No                 5               1
3  2024     40-49    Both ...                  No                10               9
4  2023     15-19    Male ...                  No                10               3

[5 rows x 15 columns]

Summary of the dataset:
Shape of the dataset: (10000, 15)

Data types of the columns:
Year                      int64
Age_Group                object
Gender                   object
Smoking_Prevalence      float64
Drug_Experimentation    float64
Socioeconomic_Status     object
Peer_Influence            int64
School_Programs          object
Family_Background         int64
Mental_Health             int64
Access_to_Counseling     object
Parental_Supervision      int64
Substance_Education      object
Community_Support         int64
Media_Influence           int64
dtype: object

Descriptive statistics:
               Year  Smoking_Prevalence  ...  Community_Support  Media_Influence
count  10000.000000        10000.000000  ...       10000.000000     10000.000000
mean    2022.000500           27.439257  ...           5.544600         5.506200
std        1.425027           12.975528  ...           2.870302         2.872836
min     2020.000000            5.000000  ...           1.000000         1.000000
25%     2021.000000           16.160000  ...           3.000000         3.000000
50%     2022.000000           27.355000  ...           6.000000         6.000000
75%     2023.000000           38.672500  ...           8.000000         8.000000
max     2024.000000           50.000000  ...          10.000000        10.000000

[8 rows x 9 columns]

Missing values in each column:
Year                    0
Age_Group               0
Gender                  0
Smoking_Prevalence      0
Drug_Experimentation    0
Socioeconomic_Status    0
Peer_Influence          0
School_Programs         0
Family_Background       0
Mental_Health           0
Access_to_Counseling    0
Parental_Supervision    0
Substance_Education     0
Community_Support       0
Media_Influence         0
dtype: int64

2. Data Preprocessing:

o Check for missing values and handle them appropriately.
o Standardize the dataset (e.g., using z-score normalization). Explain why standardization is essential in the context of PCA.

#2

import pandas as pd

# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Check for missing values
print("Missing values in each column:")
print(youth_smoking_data.isnull().sum())

# Impute missing values with the mean for numerical columns
youth_smoking_data.fillna(youth_smoking_data.mean(numeric_only=True), inplace=True)

print("Missing values after imputation:")
print(youth_smoking_data.isnull().sum())

# Standardize the numerical columns
for column in youth_smoking_data.select_dtypes(include=['float64', 'int64']).columns:
    mean = youth_smoking_data[column].mean()
    std_dev = youth_smoking_data[column].std()
    youth_smoking_data[column] = (youth_smoking_data[column] - mean) / std_dev

print("\nFirst few rows after standardization:")
print(youth_smoking_data.head())
Output:
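
The preprocessing step above asks why standardization is essential for PCA. A small self-contained demonstration (with made-up data, not the youthdrug.csv columns) shows that, without z-score scaling, the covariance matrix and therefore the first principal component are dominated by whichever feature has the largest numeric range:

import numpy as np

# Two correlated, made-up features measured on very different scales
# (illustrative only, not taken from youthdrug.csv)
rng = np.random.default_rng(42)
age = rng.normal(35, 8, 200)                     # spread of a few years
income = 2000 * age + rng.normal(0, 15000, 200)  # spread of thousands
X = np.column_stack([age, income])

def first_pc(M):
    # Unit eigenvector of the covariance matrix with the largest eigenvalue
    vals, vecs = np.linalg.eigh(np.cov(M.T))
    return vecs[:, np.argmax(vals)]

# Without standardization the large-scale feature (income) dominates PC1
print("PC1 without scaling:", np.round(first_pc(X), 3))

# After z-score standardization both features contribute comparably
X_std = (X - X.mean(axis=0)) / X.std(axis=0)
print("PC1 after scaling:  ", np.round(first_pc(X_std), 3))

Because PCA maximizes variance, features measured in large units would otherwise dominate the components regardless of how informative they are.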

3. Applying PCA:

o Implement PCA on the standardized dataset. Determine the optimal number of principal components to retain. Justify your choice based on the explained variance.

o Plot the cumulative explained variance. What threshold (e.g., 80% variance) do you consider sufficient for retaining components?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Display the first few rows and summary of the dataset
print("First few rows of the dataset:")
print(youth_smoking_data.head())

print("\nSummary of the dataset:")
print(youth_smoking_data.describe(include='all'))

# Check for missing values in each column
print("\nMissing values in each column:")
print(youth_smoking_data.isnull().sum())

# Check for infinite values in each column
print("\nInfinite values in each column:")
print((~np.isfinite(youth_smoking_data.select_dtypes(include=[np.number]))).sum())

# Fill missing values with the mean of each numeric column
for column in youth_smoking_data.select_dtypes(include=[np.number]).columns:
    youth_smoking_data[column].fillna(youth_smoking_data[column].mean(), inplace=True)

# Drop rows with any infinite values from numeric columns
youth_smoking_data = youth_smoking_data[~np.isinf(youth_smoking_data.select_dtypes(include=[np.number])).any(axis=1)]

# Check again for missing or infinite values after cleaning
print("\nChecking for missing and infinite values after cleaning:")
print(youth_smoking_data.isnull().sum())
print((~np.isfinite(youth_smoking_data.select_dtypes(include=[np.number]))).sum())

# Standardize the numerical columns (z-score normalization)
for column in youth_smoking_data.select_dtypes(include=['float64', 'int64']).columns:
    mean = youth_smoking_data[column].mean()
    std_dev = youth_smoking_data[column].std()
    youth_smoking_data[column] = (youth_smoking_data[column] - mean) / std_dev

print("\nFirst few rows after standardization:")
print(youth_smoking_data.head())

# Standardize the dataset (only the numeric columns)
scaler = StandardScaler()
youth_smoking_data_scaled = scaler.fit_transform(youth_smoking_data.select_dtypes(include=[np.number]))

# Calculate the covariance matrix
covariance_matrix = np.cov(youth_smoking_data_scaled.T)

# Check for NaNs or Infs in the covariance matrix
if np.any(np.isnan(covariance_matrix)) or np.any(np.isinf(covariance_matrix)):
    print("Covariance matrix contains NaNs or Infs!")
else:
    print("Covariance matrix is finite and valid.")

# Perform PCA
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# Display eigenvalues
print("\nEigenvalues:")
print(eigenvalues)

# Plotting the cumulative explained variance
explained_variance = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance)

plt.figure(figsize=(10, 6))
plt.plot(cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.axhline(y=0.80, color='r', linestyle='--', label='80% Threshold')
plt.legend()
plt.grid()
plt.show()

# Determine the optimal number of principal components
optimal_components = np.argmax(cumulative_variance >= 0.80) + 1  # +1 because of zero indexing
print(f"\nOptimal number of principal components to retain for 80% variance: {optimal_components}")
4. Create a scatter plot of the first two principal components. Color the points based on a

categorical variable, such as smoking status or drug use. Describe any patterns or clusters you

observe in the data.


There is a single cluster in the center of the graph

5. Examine the loadings of the principal components. Which original features have the highest influence
on the first few principal components? Discuss what this implies about youth

smoking and drug use behaviors.

# 5.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Display the first few rows and summary of the dataset
print("First few rows of the dataset:")
print(youth_smoking_data.head())

print("\nSummary of the dataset:")
print(youth_smoking_data.describe(include='all'))

# Check for missing values in each column
print("\nMissing values in each column:")
print(youth_smoking_data.isnull().sum())

# Fill missing values with the mean of each numeric column
for column in youth_smoking_data.select_dtypes(include=[np.number]).columns:
    youth_smoking_data[column].fillna(youth_smoking_data[column].mean(), inplace=True)

# Standardize the dataset (only the numeric columns)
numeric_data = youth_smoking_data.select_dtypes(include=[np.number])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)

# Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(standardized_data)

# Create a DataFrame with the principal components and the categorical variable
pc_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
pc_df['Smoking_Prevalence'] = youth_smoking_data['Smoking_Prevalence']  # Adjust this based on your categorical variable

# Create the scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pc_df, x='PC1', y='PC2', hue='Smoking_Prevalence', palette='viridis', alpha=0.7)
plt.title('Scatter Plot of First Two Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Smoking Prevalence')
plt.grid()
plt.show()

# Examine the loadings of the principal components
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Create a DataFrame for loadings
loadings_df = pd.DataFrame(loadings, index=numeric_data.columns, columns=['PC1', 'PC2'])

# Display loadings for the first two principal components
print("\nLoadings for the first two principal components:")
print(loadings_df)

# Identify the features with the highest influence on the first two principal components
top_features_pc1 = loadings_df['PC1'].abs().nlargest(5)
top_features_pc2 = loadings_df['PC2'].abs().nlargest(5)

print("\nTop features for PC1:")
print(top_features_pc1)
print("\nTop features for PC2:")
print(top_features_pc2)

Output:

Loadings for the first two principal components:

PC1 PC2

Year 0.264272 -0.594771

Smoking_Prevalence -0.044529 -0.301588

Drug_Experimentation -0.355048 -0.458871

Peer_Influence 0.470234 -0.220234

Family_Background 0.338889 -0.308383

Mental_Health -0.444141 -0.285692

Parental_Supervision 0.153690 -0.088601

Community_Support -0.498281 0.003799

Media_Influence -0.202244 -0.383311

Top features for PC1:

Community_Support 0.498281

Peer_Influence 0.470234

Mental_Health 0.444141

Drug_Experimentation 0.355048

Family_Background 0.338889



6. Reconstruction:

o Attempt to reconstruct the original dataset from the retained principal components (e.g., the

first two components). Assess the quality of the reconstruction and discuss any significant

differences compared to the original data.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Standardize the numeric columns
numeric_data = youth_smoking_data.select_dtypes(include=[np.number])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)

# Perform PCA to retain only the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(standardized_data)

# Reconstruct the data using the first two principal components
reconstructed_data = pca.inverse_transform(principal_components)

# Convert the reconstructed data back to the original scale
reconstructed_data_original_scale = scaler.inverse_transform(reconstructed_data)

# Convert to DataFrame for easy comparison
reconstructed_df = pd.DataFrame(reconstructed_data_original_scale, columns=numeric_data.columns)
original_df = pd.DataFrame(scaler.inverse_transform(standardized_data), columns=numeric_data.columns)

# Calculate reconstruction error (mean squared error for each feature)
reconstruction_error = np.mean((original_df - reconstructed_df) ** 2)
print("\nReconstruction error for each feature:")
print(reconstruction_error)

# Visualize the difference between original and reconstructed data for a few features
plt.figure(figsize=(12, 6))
for i, column in enumerate(numeric_data.columns[:5], 1):  # Compare first 5 features for visualization
    plt.subplot(2, 3, i)
    plt.plot(original_df[column], label='Original', alpha=0.7)
    plt.plot(reconstructed_df[column], label='Reconstructed', alpha=0.7)
    plt.title(column)
    plt.legend()

plt.tight_layout()
plt.show()

Output:
7. Interpretation and Insights:

o Based on your PCA results, summarize key insights about the relationships between smoking and drug use among youth. What factors seem to be most influential?

o Experiment with different numbers of principal components and observe how the patterns change in your visualizations.

o Discuss the potential implications of your findings for public health initiatives aimed at reducing youth smoking and substance use.


Flower Dataset

1. Dataset Preparation: You are given a dataset containing measurements of various

features of different types of flowers (e.g., sepal length, sepal width, petal length, petal

width). Load the dataset and display its first few rows.
2. Standardization: Before applying PCA, standardize the dataset. Explain why

standardization is necessary in PCA.

Code:
import csv

def load_dataset(file_path):
    dataset = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Skip header if there is one
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Assuming last column is the label (species)
    return dataset

def calculate_mean_std(dataset):
    means = []
    stds = []
    num_features = len(dataset[0])
    # Calculate mean and standard deviation for each feature
    for i in range(num_features):
        feature_values = [row[i] for row in dataset]
        mean = sum(feature_values) / len(feature_values)
        std = (sum((x - mean) ** 2 for x in feature_values) / len(feature_values)) ** 0.5
        means.append(mean)
        stds.append(std)
    return means, stds

def standardize_dataset(dataset, means, stds):
    standardized_data = []
    for row in dataset:
        standardized_row = [(row[i] - means[i]) / stds[i] for i in range(len(row))]
        standardized_data.append(standardized_row)
    return standardized_data

# Load your flower dataset
file_path = 'IRIS.csv'  # Path to your provided dataset file
dataset = load_dataset(file_path)

# Calculate means and standard deviations for each feature (sepal length, sepal width, petal length, petal width)
means, stds = calculate_mean_std(dataset)

# Standardize the dataset
standardized_data = standardize_dataset(dataset, means, stds)

# Display the rows of the standardized dataset
for row in standardized_data[:]:
    print(row)
Output:

This is done for the whole dataset.

3. PCA Implementation:

o Apply PCA to the standardized dataset. How many principal components do

you choose, and why?

o Plot the explained variance ratio for each principal component. What does this
plot tell you about the importance of each component?

import csv
import numpy as np
import matplotlib.pyplot as plt

# Load Dataset (Assuming it's already in CSV format with headers)
def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Convert all but the last column to float
            labels.append(row[-1])  # Store the last column as label
    return np.array(dataset), labels

# Load the dataset
file_path = 'IRIS.csv'  # Replace with the actual path to your dataset
data, labels = load_dataset(file_path)

# Standardize the dataset
mean = np.mean(data, axis=0)
std_dev = np.std(data, axis=0)
data_standardized = (data - mean) / std_dev

# Compute the covariance matrix
covariance_matrix = np.cov(data_standardized, rowvar=False)

# Calculate eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Sort the eigenvalues and their corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Explained variance ratio
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)

print("Explained Variance Ratio for Each Principal Component:")
for i, var_ratio in enumerate(explained_variance_ratio):
    print(f"Principal Component {i + 1}: {var_ratio:.4f}")

# Plot the explained variance ratio
plt.figure(figsize=(8, 5))
plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, alpha=0.7, color='blue')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio by Principal Component')
plt.show()

# Choosing the number of components based on explained variance
# For example, if we want to retain 95% of the variance:
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
num_components = np.argmax(cumulative_explained_variance >= 0.95) + 1
print(f"Number of components chosen to retain 95% variance: {num_components}")

# Applying PCA with the chosen number of components
data_pca_final = np.dot(data_standardized, sorted_eigenvectors[:, :num_components])

# Optional: Display the transformed data (first few rows)
print("Transformed Data (first few rows):")
print(data_pca_final[:5])
4. Visualization: Create a scatter plot of the first two principal components. How do the

different flower species cluster in this plot?

import csv
import numpy as np
import matplotlib.pyplot as plt

# Load Dataset (Assuming it's already in CSV format with headers)
def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Convert all but the last column to float
            labels.append(row[-1])  # Store the last column as label
    return np.array(dataset), labels

# Load the dataset
file_path = 'IRIS.csv'  # Replace with the actual path to your dataset
data, labels = load_dataset(file_path)

# Standardize the dataset
mean = np.mean(data, axis=0)
std_dev = np.std(data, axis=0)
data_standardized = (data - mean) / std_dev

# Compute the covariance matrix
covariance_matrix = np.cov(data_standardized, rowvar=False)

# Calculate eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Sort the eigenvalues and their corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Explained variance ratio
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)

# Choosing the number of components to retain (first 2 components for the scatter plot)
num_components = 2
data_pca_final = np.dot(data_standardized, sorted_eigenvectors[:, :num_components])

# Encode labels into numeric values
unique_labels = list(set(labels))
label_to_color = {label: idx for idx, label in enumerate(unique_labels)}
colors = [label_to_color[label] for label in labels]

# Plotting the first two principal components
plt.figure(figsize=(10, 7))
scatter = plt.scatter(data_pca_final[:, 0], data_pca_final[:, 1], c=colors, cmap='viridis', alpha=0.7)

# Create a color map for the legend
cmap = plt.get_cmap('viridis', len(unique_labels))

# Create handles for the legend
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cmap(i), markersize=10) for i in range(len(unique_labels))]

# Show the legend with labels
plt.legend(handles, unique_labels, title='Flower Species')

# Adding titles and labels
plt.title('PCA of Flower Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()
# Create handles for the legend
5. Reconstruction: Attempt to reconstruct the original dataset from the first two

principal components. How does the reconstructed data compare to the original data?

import csv

import numpy as np

import matplotlib.pyplot as plt

# Load Dataset (Assuming it's already in CSV format with headers)

def load_dataset(file_path):

dataset = []

labels = []

with open(file_path, mode='r') as file:


reader = csv.reader(file)

header = next(reader) # Read header if present

for row in reader:

dataset.append([float(value) for value in row[:-1]]) # Convert all but the last column to float

labels.append(row[-1]) # Store the last column as label

return np.array(dataset), labels

# Load the dataset

file_path = 'IRIS.csv' # Replace with the actual path to your dataset

data, labels = load_dataset(file_path)

# Standardize the dataset

mean = np.mean(data, axis=0)

std_dev = np.std(data, axis=0)

data_standardized = (data - mean) / std_dev

# Compute the covariance matrix

covariance_matrix = np.cov(data_standardized, rowvar=False)

# Calculate eigenvalues and eigenvectors

eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Sort the eigenvalues and their corresponding eigenvectors in descending order

sorted_indices = np.argsort(eigenvalues)[::-1]

sorted_eigenvalues = eigenvalues[sorted_indices]

sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Explained variance ratio

explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)


# Choosing the number of components to retain (first 2 components for the scatter plot)

num_components = 2

data_pca_final = np.dot(data_standardized, sorted_eigenvectors[:, :num_components])

# Encode labels into numeric values

unique_labels = list(set(labels))

label_to_color = {label: idx for idx, label in enumerate(unique_labels)}

colors = [label_to_color[label] for label in labels]

# Plotting the first two principal components

plt.figure(figsize=(10, 7))

scatter = plt.scatter(data_pca_final[:, 0], data_pca_final[:, 1], c=colors, cmap='viridis', alpha=0.7)

# Create a color map for the legend

cmap = plt.get_cmap('viridis', len(unique_labels))

# Create handles for the legend

handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cmap(i), markersize=10) for i in


range(len(unique_labels))]

# Show the legend with labels

plt.legend(handles, unique_labels, title='Flower Species')

Reconstruction: The reconstruction is performed by taking the reduced data (the first two principal
components), multiplying it by the transpose of the corresponding eigenvectors, scaling it back up by
the original standard deviations, and finally adding the mean.

Comparison: The original and reconstructed data are compared side by side for the first five samples to visualize how well the PCA has captured the original data.
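
A minimal sketch of that reconstruction, reusing the variables defined in the listing above (data, mean, std_dev, sorted_eigenvectors and data_pca_final, with the first two components retained):

# Reconstruct the (approximate) original measurements from the first two PCs
k = 2
reconstructed_standardized = np.dot(data_pca_final, sorted_eigenvectors[:, :k].T)

# Undo the z-score standardization: scale by the original std devs, add the means
reconstructed = reconstructed_standardized * std_dev + mean

# Side-by-side comparison of the first five samples
for original_row, reconstructed_row in zip(data[:5], reconstructed[:5]):
    print(np.round(original_row, 2), "->", np.round(reconstructed_row, 2))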
Linear Regression
1. Consider the following simple dataset containing two sets of variables: dependent

variable (y) and independent variable (x) for performing prediction based on it.

Basically you have to estimate the best fit line according to the given data.

Program:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Step 1: Read CSV data
data = pd.read_csv('LinearRegression.csv')  # Replace with the path to your file
x = data['X'].values
y = data['Y'].values
x = (x - np.mean(x)) / np.std(x)
y = (y - np.mean(y)) / np.std(y)

# Step 2: Visualize the data points
plt.scatter(x, y, color='blue')
plt.xlabel('X (Independent variable)')
plt.ylabel('Y (Dependent variable)')
plt.title('Data Points')
plt.show()

# Step 3 and 4: Define MSE loss and gradient descent optimizer
def compute_mse(x, y, m, c):
    y_pred = m * x + c
    mse = np.mean((y - y_pred) ** 2)
    return mse

def gradient_descent(x, y, m, c, learning_rate, iterations):
    n = len(y)
    mse_history = []

    for i in range(iterations):
        y_pred = m * x + c
        d_m = (-2 / n) * np.sum(x * (y - y_pred))
        d_c = (-2 / n) * np.sum(y - y_pred)

        # Update m and c
        m -= learning_rate * d_m
        c -= learning_rate * d_c

        # Calculate MSE and store it
        mse = compute_mse(x, y, m, c)
        mse_history.append(mse)

    return m, c, mse_history

# Step 5: Training parameters
learning_rate = 0.001
iterations = 1000
m, c = 0, 0

# Train the model
m, c, mse_history = gradient_descent(x, y, m, c, learning_rate, iterations)

# Step 6: Plot data points and the best-fit line
plt.scatter(x, y, color='blue', label='Data Points')
plt.plot(x, m * x + c, color='red', label='Best Fit Line')
plt.xlabel('X (Independent variable)')
plt.ylabel('Y (Dependent variable)')
plt.title('Best Fit Line using Gradient Descent')
plt.legend()
plt.show()

# Step 7: Plot MSE over iterations
plt.plot(range(iterations), mse_history, color='green')
plt.xlabel('Iteration')
plt.ylabel('Mean Squared Error')
plt.title('MSE over Iterations')
plt.show()

print(f"Final slope (m): {m}")
print(f"Final intercept (c): {c}")
2. Implementation of PCA and Linear Regression

Consider the house-price dataset and select at least five important feature sets. Perform dimension reduction through PCA and then apply linear regression to estimate the price of the house.


import csv
import math
import numpy as np

# Step 1: Load the dataset manually
def load_csv(filename):
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
    headers = data[0]
    rows = data[1:]

    # Handle non-numeric data (categorical features)
    for i in range(len(rows[0])):
        try:
            for row in rows:
                row[i] = float(row[i])
        except ValueError:
            unique_values = list(set(row[i] for row in rows))
            value_map = {val: idx for idx, val in enumerate(unique_values)}
            for row in rows:
                row[i] = value_map[row[i]]

    return headers, rows

# Load dataset
headers, rows = load_csv('./Housing.csv')

# Step 2: Extract features and target variable
X = [row[:-1] for row in rows]  # Features
y = [row[0] for row in rows]    # Target (house price)

# Step 3: Standardize features
def standardize_matrix(matrix):
    means = [sum(column) / len(column) for column in zip(*matrix)]
    stds = [math.sqrt(sum((x - mean) ** 2 for x in column) / len(column))
            for column, mean in zip(zip(*matrix), means)]
    standardized = [[(x - mean) / std if std != 0 else 0
                     for x, mean, std in zip(row, means, stds)] for row in matrix]
    return standardized, means, stds

X_standardized, X_means, X_stds = standardize_matrix(X)
print(f"Standardized Features:\n{X_standardized[:5]}")  # Print first 5 for brevity
print(f"Means of Features: {X_means}")
print(f"Standard Deviations of Features: {X_stds}")

# Standardize target variable (house prices)
y_mean = sum(y) / len(y)
y_std = math.sqrt(sum((yi - y_mean) ** 2 for yi in y) / len(y))
y_standardized = [(yi - y_mean) / y_std for yi in y]
print(f"Standardized Target Prices:\n{y_standardized[:5]}")  # First 5 standardized target values
print(f"Mean of Target: {y_mean}")
print(f"Standard Deviation of Target: {y_std}")

# Step 4: Calculate covariance matrix
def covariance_matrix(matrix):
    n = len(matrix)
    return [[sum(matrix[i][j] * matrix[i][k] for i in range(n)) / (n - 1)
             for k in range(len(matrix[0]))] for j in range(len(matrix[0]))]

cov_matrix = covariance_matrix(X_standardized)
print(f"Covariance Matrix (first 5x5):\n{np.array(cov_matrix)[:5, :5]}")  # Display top left corner of the covariance matrix

# Step 5: Perform Eigen decomposition (PCA)
def eigen_decomposition(matrix):
    def power_iteration(A, num_simulations=1000):
        n = len(A)
        b_k = [1] * n
        for _ in range(num_simulations):
            b_k1 = [sum(A[i][j] * b_k[j] for j in range(n)) for i in range(n)]
            b_k1_norm = math.sqrt(sum(x ** 2 for x in b_k1))
            b_k = [x / b_k1_norm for x in b_k1]
        eigenvalue = sum(sum(A[i][j] * b_k[j] for j in range(n)) * b_k[i] for i in range(n))
        return eigenvalue, b_k

    eigenvalues, eigenvectors = [], []
    for _ in range(len(matrix)):
        eigenvalue, eigenvector = power_iteration(matrix)
        eigenvalues.append(eigenvalue)
        eigenvectors.append(eigenvector)
        matrix = [[matrix[i][j] - eigenvalue * eigenvector[i] * eigenvector[j]
                   for j in range(len(matrix))] for i in range(len(matrix))]
    return eigenvalues, eigenvectors

eigenvalues, eigenvectors = eigen_decomposition(cov_matrix)
print(f"Eigenvalues: {eigenvalues[:5]}")  # Display first 5 eigenvalues
print(f"Eigenvectors (first 5): {eigenvectors[:5]}")  # Display first 5 eigenvectors

# Step 6: Select top k components (PCA)
k = 5
sorted_indices = sorted(range(len(eigenvalues)), key=lambda i: eigenvalues[i], reverse=True)[:k]
top_eigenvectors = [eigenvectors[i] for i in sorted_indices]
print(f"Selected Top {k} Eigenvectors: {top_eigenvectors}")

# Step 7: Project data onto top k components
X_pca = [[sum(x[j] * top_eigenvectors[i][j] for j in range(len(x))) for i in range(k)]
         for x in X_standardized]
print(f"PCA Transformed Data (first 5 rows):\n{X_pca[:5]}")

# Step 8: Train-test split
def train_test_split(X, y, test_size=0.2):
    split_index = int(len(X) * (1 - test_size))
    return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

X_train, X_test, y_train, y_test = train_test_split(X_pca, y_standardized)
print(f"Training Data (X_train): {X_train[:5]}")
print(f"Testing Data (X_test): {X_test[:5]}")

# Step 9: Linear Regression
def linear_regression(X, y):
    X_transpose = [[X[i][j] for i in range(len(X))] for j in range(len(X[0]))]
    X_transpose_X = [[sum(X_transpose[i][k] * X[k][j] for k in range(len(X)))
                      for j in range(len(X[0]))] for i in range(len(X[0]))]
    X_transpose_y = [sum(X_transpose[i][k] * y[k] for k in range(len(X)))
                     for i in range(len(X[0]))]
    return np.linalg.solve(X_transpose_X, X_transpose_y)  # Solve using numpy

coefficients = linear_regression(X_train, y_train)
print(f"Linear Regression Coefficients: {coefficients}")

# Predict and transform back to original scale
def predict(X, coefficients):
    return np.dot(X, coefficients)

y_pred_standardized = predict(X_test, coefficients)
print(f"Predicted Standardized Values: {y_pred_standardized[:5]}")  # Display first 5 predictions

# Back-transform predictions to original scale
y_pred_actual = (y_pred_standardized * y_std) + y_mean
print(f"Predicted House Prices (Original scale): {y_pred_actual[:5]}")  # Display first 5 back-transformed predictions

# Results
print("Predicted House Prices (Original scale):")
for i, price in enumerate(y_pred_actual[:10]):
    print(f"House {i + 1}: {price:.2f}")
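For comparison only, the same pipeline can be reproduced with scikit-learn. This is a hedged sketch, not part of the manual implementation above; it assumes scikit-learn is installed and, like the program above, that the price is the first column of Housing.csv:

# Hypothetical cross-check using library implementations of PCA and linear regression.
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

df = pd.read_csv('./Housing.csv')
# Encode categorical columns as integer codes, mirroring the manual value_map above.
df = df.apply(lambda col: pd.factorize(col)[0] if col.dtype == object else col)
y_raw = df.iloc[:, 0].values    # assumption: price is the first column
X_raw = df.iloc[:, 1:].values   # remaining columns as features

X_scaled = StandardScaler().fit_transform(X_raw)
X_reduced = PCA(n_components=5).fit_transform(X_scaled)  # keep 5 principal components

X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X_reduced, y_raw, test_size=0.2, shuffle=False)
model = LinearRegression().fit(X_tr, y_tr)
print("R^2 on the held-out 20% split:", model.score(X_te, y_te))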

Output:

Standardized Features:
[[4.566365125868964, 1.0467262882352135, 1.4034193573588287, 1.4218117367065388, 1.3782169202337782, 0.4056228722963174, -
6. Interpretation: Based on your PCA results, discuss which features contribute most to the variance in the dataset.

import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load Dataset (Assuming it's already in CSV format with headers)
def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Convert all but the last column to float
            labels.append(row[-1])  # Store the last column as label
    return np.array(dataset), labels

# Load the dataset
file_path = 'IRIS.csv'  # Replace with the actual path to your dataset
data, labels = load_dataset(file_path)

# Standardize the dataset
mean = np.mean(data, axis=0)
std_dev = np.std(data, axis=0)
data_standardized = (data - mean) / std_dev

# Compute the covariance matrix
covariance_matrix = np.cov(data_standardized, rowvar=False)

# Calculate eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Sort the eigenvalues and their corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Explained variance ratio
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)

# Choosing the number of components to retain (first 2 components for the scatter plot)
num_components = 2
data_pca_final = np.dot(data_standardized, sorted_eigenvectors[:, :num_components])

# Encode labels into numeric values
unique_labels = list(set(labels))
label_to_color = {label: idx for idx, label in enumerate(unique_labels)}
colors = [label_to_color[label] for label in labels]

# Plotting the first two principal components
plt.figure(figsize=(10, 7))
scatter = plt.scatter(data_pca_final[:, 0], data_pca_final[:, 1], c=colors, cmap='viridis', alpha=0.7)

# Create a color map for the legend
cmap = plt.get_cmap('viridis', len(unique_labels))

# Create handles for the legend
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=cmap(i), markersize=10)
           for i in range(len(unique_labels))]

# Show the legend with labels
plt.legend(handles, unique_labels, title='Flower Species')

# Adding titles and labels
plt.title('PCA of Flower Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()

# Reconstruction from the first two principal components
# Reverse the PCA transformation
data_reconstructed = np.dot(data_pca_final, sorted_eigenvectors[:, :num_components].T) * std_dev + mean

# Compare the reconstructed data with the original data
# Let's look at the first 5 samples for comparison
comparison = np.hstack((data[:5], data_reconstructed[:5]))

# Display the original and reconstructed data
print("Original vs Reconstructed Data (First 5 Samples):")
print(comparison)

# Calculate and display component loadings
loadings = sorted_eigenvectors[:, :num_components] * np.sqrt(sorted_eigenvalues[:num_components])
loadings_df = pd.DataFrame(loadings,
                           columns=[f'PC{i+1}' for i in range(num_components)],
                           index=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
print("\nComponent Loadings:")
print(loadings_df)

Based on the component loadings, petal length contributes the most to the variance in the dataset.
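This claim can be checked directly from the quantities already computed above; the following short sketch (reusing explained_variance_ratio and loadings_df from the program) prints the variance explained by each component and ranks the features by the magnitude of their PC1 loading:

# How much variance each principal component explains.
print("Explained variance ratio:", np.round(explained_variance_ratio, 3))

# Features ranked by the absolute size of their loading on PC1.
print(loadings_df['PC1'].abs().sort_values(ascending=False))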

7. Explore how the results change if you decide to keep more principal components (e.g., three or four). What insights can you gain?
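One possible starting point for this exploration, sketched under the assumption that the variables from the previous program (data, data_standardized, sorted_eigenvectors, explained_variance_ratio, std_dev, mean) are still in scope:

# For k = 2, 3 and 4 retained components, report the cumulative explained
# variance and the mean squared reconstruction error after back-projection.
for k in range(2, 5):
    W = sorted_eigenvectors[:, :k]
    projected = np.dot(data_standardized, W)
    reconstructed = np.dot(projected, W.T) * std_dev + mean
    cum_var = np.sum(explained_variance_ratio[:k])
    mse = np.mean((data - reconstructed) ** 2)
    print(f"k={k}: cumulative explained variance = {cum_var:.3f}, reconstruction MSE = {mse:.4f}")

As more components are retained, the cumulative explained variance approaches 1 and the reconstruction error shrinks, which indicates how much information the discarded components carried.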
