DVPD Final Lab Word PDF
(CSDC – 023)
3rd Semester Computer Science and Engineering
Prepared during (Jun-Dec 2024) by
Name: Ravi Nautiyal
Roll No: 23103117
Submitted to: Mr. Vishal Kumar Verma
Bar Chart
import pandas as pd
import matplotlib.pyplot as plt

# reading the database
data = pd.read_csv("tips.csv")

# Bar chart with day against tip
plt.bar(data['day'], data['tip'])
plt.title("Bar Chart")
plt.xlabel('Day')
plt.ylabel('Tip')
plt.show()
Line Chart
# importing packages
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("tips.csv")

# line chart of tip against day
sns.lineplot(x='day', y='tip', data=data)
plt.title("Line Chart")
plt.show()

Heat Map
import plotly.express as px

data = [[1, 25, 30, 50, 1],
        [20, 1, 60, 80, 30],
        [30, 60, 1, 5, 20]]
fig = px.imshow(data,
                labels=dict(x="Day of Week", y="Time of Day", color="Productivity"),
                x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'])
fig.update_xaxes(side="top")
fig.show()

Histogram
# histogram of total_bill
data = pd.read_csv("tips.csv")
plt.hist(data['total_bill'])
plt.title("Histogram")
plt.ylabel('Frequency')
plt.show()

Waterfall Chart
import plotly.graph_objects as go
data['end'] = data['cumulative']
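Only the plotly.graph_objects import and the cumulative 'end' column survive from the waterfall example above, so the following is a minimal, self-contained sketch of a waterfall chart built with go.Waterfall; the category names and values are made up purely for illustration.

import plotly.graph_objects as go

# Hypothetical monthly figures; "relative" bars accumulate and "total" closes the chart
fig = go.Figure(go.Waterfall(
    x=["Sales", "Consulting", "Refunds", "Operating costs", "Net"],
    y=[600, 200, -100, -300, 0],
    measure=["relative", "relative", "relative", "relative", "total"],
))
fig.update_layout(title="Waterfall Chart")
fig.show()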
Scatter plot: used to show the relationship between two variables using dots.
Heatmap: used to display a visual representation of data using colors.
3D: R also gives us the option to display 3D charts.
Line Chart
Bar Chart
Pie Chart
Histogram
Bubble Chart
Waterfall Chart
Sankey Diagram
> library(plotly)
The following objects are masked from 'package:stats':
2. Split the whole data and see if there are any obvious patterns (correlations between the features).
import csv

# Load the CSV file manually
# Ensure we only extract numeric values, converting where possible
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # skip the header
        for row in reader:
            try:
                data.append([float(x) for x in row])
            except ValueError:
                print(f"Skipping row with non-numeric values: {row}")
    return data

# Extract individual columns (adjust indices based on actual dataset structure)
age = [row[0] for row in data]     # Assuming age is in the 1st column
sex = [row[1] for row in data]     # Assuming sex is in the 2nd column
trtbps = [row[3] for row in data]  # Assuming trtbps (blood pressure) is in the 4th column

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)
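As a rough sketch of how these columns could be checked for pairwise patterns, the Pearson correlation between, say, age and resting blood pressure can be computed directly; this assumes the age and trtbps lists produced by the loading code above.

# Minimal sketch: Pearson correlation between two columns of the heart dataset
def pearson(x, y):
    n = len(x)
    mu_x, mu_y = sum(x) / n, sum(y) / n
    cov = sum((a - mu_x) * (b - mu_y) for a, b in zip(x, y)) / n
    std_x = (sum((a - mu_x) ** 2 for a in x) / n) ** 0.5
    std_y = (sum((b - mu_y) ** 2 for b in y) / n) ** 0.5
    return cov / (std_x * std_y)

print("corr(age, trtbps):", pearson(age, trtbps))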
3. Let's shrink the whole dataset down to 2 dimensions (with as little loss as possible) with PCA.
Note that while computing the principal components, you have to find the Covariance Matrix and unit eigenvectors, and print them as well.
print(f"Skipping row with non-numeric # Standardize the data (remove the target
values: {row}") column if included)
# Rest of the code remains the same # Step 1: Compute the covariance matrix
print(eigenvectors)
# Step 3: Sort eigenvalues and select the top 2 0.04456768 0.29674147 -0.39558583 -
0.1497243 0.120113 -0.18165254
sorted_indices = np.argsort(eigenvalues)[::-1]
-0.16227112 0.43523468]
eigenvalues = eigenvalues[sorted_indices]
[ 0.28027591 -
eigenvectors = eigenvectors[:, sorted_indices]
0.0569568 0.0477654 1.00331126 0.123582
print("\nSorted Eigenvalues (Variance explained 07 0.17811839
by each component):")
-0.11448061 -
print(eigenvalues) 0.04685236 0.06784001 0.19385626 -
0.12187682 0.10172471
-0.1165957 -
0.39984155 0.09712136 0.21070797 - Eigenvectors (Principal Components):
0.16937323 0.27724123
0.06822655 -0.2261852 ]
Sorted Eigenvalues (Variance explained by each
[-0.09877258 1.00331126 -0.0495163 - component):
0.0569568 -0.19856751 0.0451809
[3.31240732 1.57737062 1.23610856
-0.05838897 - 1.2106344 1.02541325 0.97338089
0.04416567 0.1421329 0.09641107 -
0.86609311 0.77866275 0.73160926
0.03081226 0.11865301
0.62478273 0.53617184 0.43145758
0.2107366 -0.28186683]
0.37310751 0.3691578 ]
[-0.06888034 -
0.0495163 1.00331126 0.0477654 -
0.07715904 0.09475676
Projected Data onto 2 Principal Components PS C:\Users\Ravi
(PCA result): Nautiyal\OneDrive\Desktop\DVPDLABWORK\P
CA>
[[-5.17391076e-02 2.62402204e+00]
[-8.17440594e-01 -7.30374576e-01]
import matplotlib.pyplot as plt
[-2.05759932e+00 -3.90984463e-02]
# Plot the PCA result (first 2 principal
[-1.90304318e+00 -5.96700877e-01]
components)
[-7.68370620e-01 4.12545496e-01]
plt.scatter(projected_data[:, 0],
[-1.00063986e+00 -2.33692414e-01] projected_data[:, 1], alpha=0.5)
[-2.06867462e+00 6.26420387e-01]
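Since the task asks specifically for unit eigenvectors, a quick sanity check (assuming the eigenvectors matrix returned by np.linalg.eig above) is that every column has norm 1 and that the columns are mutually orthogonal:

import numpy as np

# np.linalg.eig returns the eigenvectors as columns; each should already have unit length
print("Column norms (should all be ~1):", np.linalg.norm(eigenvectors, axis=0))

# For a symmetric covariance matrix the eigenvectors are also orthogonal,
# so V^T V should be close to the identity matrix
identity_error = np.abs(eigenvectors.T @ eigenvectors - np.eye(eigenvectors.shape[1])).max()
print("Max deviation from orthonormality:", identity_error)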
4. Plot the whole input points as well as the corresponding computed principal components
def standardize_data(data):
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    return (data - means) / std_devs
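Little of the plotting code for this part survives; a minimal sketch of the idea, assuming the standardized feature matrix (called data_standardized here) and the sorted eigenvalues and eigenvectors from the previous step, scatters the first two standardized features and overlays the top two principal-component directions as arrows from the origin:

import numpy as np
import matplotlib.pyplot as plt

# Scatter the first two standardized features
plt.scatter(data_standardized[:, 0], data_standardized[:, 1], alpha=0.4, label='Input points')

# Overlay the top two principal components (their projection onto these two features),
# scaled by sqrt(eigenvalue) so the arrows are visible
for i, colour in zip(range(2), ['red', 'green']):
    direction = eigenvectors[:2, i] * np.sqrt(eigenvalues[i])
    plt.quiver(0, 0, direction[0], direction[1], angles='xy', scale_units='xy', scale=1,
               color=colour, label=f'PC{i + 1}')

plt.xlabel('Feature 1 (standardized)')
plt.ylabel('Feature 2 (standardized)')
plt.legend()
plt.show()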
import pandas as pd

class_counts = df['Genre'].value_counts()
print(class_counts)

Output:
2. Split the whole data and see if there are any obvious patterns (correlations between the features).
import csv
import numpy as np
import matplotlib.pyplot as plt
import math

# Load the CSV file and filter only numeric columns
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        headers = next(reader)  # Skip the header
        for row in reader:
            numeric_row = []
            for x in row:
                try:
                    numeric_row.append(float(x))
                except ValueError:
                    pass  # Skip non-numeric values
            if len(numeric_row) > 0:  # Only append if we found numeric values
                data.append(numeric_row)
    # Find the maximum number of columns (some rows may have missing values)
    max_columns = max(len(row) for row in data)
    return np.array(data)

# Compute the mean of a list
def mean(data):
    return sum(data) / len(data)

# Compute the standard deviation of a list
def std_dev(data):
    mu = mean(data)
    variance = sum((x - mu) ** 2 for x in data) / len(data)
    return math.sqrt(variance)

# Compute the correlation between two lists
def correlation(x, y):
    mu_x = mean(x)
    mu_y = mean(y)
    std_x = std_dev(x)
    std_y = std_dev(y)
    cov = sum((x[i] - mu_x) * (y[i] - mu_y) for i in range(len(x))) / len(x)
    return cov / (std_x * std_y)

# Scatter plot function
def scatter_plot(x, y, x_label, y_label):
    plt.scatter(x, y, alpha=0.5)
    plt.title(f'Scatter plot of {x_label} vs {y_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()

# Load the dataset (numeric columns only)
data = load_csv('movielist.csv')

if data.size == 0:
    print("No numeric data available for analysis.")
else:
    # Compute the correlation matrix manually
    num_columns = data.shape[1]
    corr_matrix = [[correlation(data[:, i], data[:, j]) for j in range(num_columns)]
                   for i in range(num_columns)]
    print("Correlation Matrix:")
    for row in corr_matrix:
        print(row)

    # Scatter plot between two numeric variables (first two numeric columns)
    scatter_plot(data[:, 0], data[:, 1], 'Numeric Feature 1', 'Numeric Feature 2')

Output:

3. Let's shrink the whole dataset down to 2 dimensions (with as little loss as possible) with PCA.
Note that while computing the principal components, you have to find the Covariance Matrix and unit eigenvectors, and print them as well.

# Standardize the data manually (mean = 0, variance = 1)
def standardize_data(data):
    means = np.mean(data, axis=0)
    std_devs = np.std(data, axis=0)
    standardized_data = (data - means) / std_devs
    return standardized_data

# Compute the covariance matrix manually
def compute_covariance_matrix(data):
    return np.cov(data.T)

# Compute eigenvalues and eigenvectors manually
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Project the data onto the top k eigenvectors manually
def project_data(data, eigenvectors, k):
    return np.dot(data, eigenvectors[:, :k])

# Step 1: Compute the covariance matrix
data_standardized = standardize_data(data)
cov_matrix = compute_covariance_matrix(data_standardized)
print("Covariance Matrix:")
print(cov_matrix)

# Step 2: Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = compute_eigen(cov_matrix)
print("\nEigenvalues:")
print(eigenvalues)
print("\nEigenvectors (Principal Components):")
print(eigenvectors)

# Step 3: Sort eigenvalues and select the top 2
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]
print("\nSorted Eigenvalues (Variance explained by each component):")
Output:
4. Plot the whole input points as well as the corresponding computed principal components.
# The CSV loading and the helper functions (mean, std_dev, correlation,
# standardize_data, compute_covariance_matrix and compute_eigen) are reused from the previous parts.

# Plot the standardized input points (first two features)
plt.scatter(data_standardized[:, 0], data_standardized[:, 1], alpha=0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.tight_layout()
plt.show()
Output:
PCA 2
PCA on the Youth Smoking and Drug Dataset
1. Load the Youth Smoking and Drug Dataset, which contains various features related to youth smoking habits, substance use, and demographic information. Display the first few rows of the dataset and summarize its key characteristics.
#1
import pandas as pd

# Check for missing values
print("Missing values in each column:")
print(youth_smoking_data.isnull().sum())

Output:
(dataset dtypes, excerpt)
Substance_Education    object
Media_Influence         int64

(missing values per column)
Age_Group               0
Gender                  0
Smoking_Prevalence      0
Drug_Experimentation    0
Socioeconomic_Status    0
Peer_Influence          0
School_Programs         0
Family_Background       0
Mental_Health           0
Substance_Education     0
Community_Support       0
Media_Influence         0
dtype: int64
2. Data Preprocessing:
o Check for missing values and handle them appropriately.
o Standardize the dataset (e.g., using z-score normalization). Explain why standardization is important before applying PCA.

# Impute missing values with the mean for numerical columns
youth_smoking_data.fillna(youth_smoking_data.mean(), inplace=True)

Output:
3. Applying PCA:
o Implement PCA on the standardized dataset. Determine the optimal number of principal components to retain.
o Plot the cumulative explained variance. What threshold (e.g., 80% variance) do you consider appropriate?
# Display the first few rows and summary of the dataset
print("First few rows of the dataset:")
print(youth_smoking_data.head())

print("\nSummary of the dataset:")
print(youth_smoking_data.describe(include='all'))

# Check for missing values in each column
print("\nMissing values in each column:")
print(youth_smoking_data.isnull().sum())

# Check again for missing or infinite values after cleaning
print("\nChecking for missing and infinite values after cleaning:")
print(youth_smoking_data.isnull().sum())
print((~np.isfinite(youth_smoking_data.select_dtypes(include=[np.number]))).sum())

# Standardize the dataset (only the numeric columns)
scaler = StandardScaler()
youth_smoking_data_scaled = scaler.fit_transform(youth_smoking_data.select_dtypes(include=[np.number]))
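As a quick check that the z-score scaling behaved as expected, a small sketch (assuming youth_smoking_data_scaled from the StandardScaler call above) verifies that every column now has mean roughly 0 and standard deviation roughly 1:

import numpy as np

print("Column means after scaling (~0):", np.round(youth_smoking_data_scaled.mean(axis=0), 6))
print("Column std devs after scaling (~1):", np.round(youth_smoking_data_scaled.std(axis=0), 6))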
plt.show()

# Display eigenvalues
print("\nEigenvalues:")
print(eigenvalues)

# Plotting the cumulative explained variance
explained_variance = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance)

# Determine the optimal number of principal components
optimal_components = np.argmax(cumulative_variance >= 0.80) + 1  # +1 because of zero indexing
print(f"\nOptimal number of principal components to retain for 80% variance: {optimal_components}")
4. Create a scatter plot of the first two principal components. Color the points based on a categorical variable, such as smoking status or drug use. Describe any patterns or clusters you observe.
5. Examine the loadings of the principal components. Which original features have the highest influence on the first few principal components? Discuss what this implies about youth smoking and drug use.
youth_smoking_data = pd.read_csv(file_path)

# Standardize the dataset (only the numeric columns)
numeric_data = youth_smoking_data.select_dtypes(include=[np.number])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)

pc_df['Smoking_Prevalence'] = youth_smoking_data['Smoking_Prevalence']  # Adjust this based on your categorical variable

# Create the scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pc_df, x='PC1', y='PC2', hue='Smoking_Prevalence', palette='viridis', alpha=0.7)
plt.grid()
plt.show()

# Examine the loadings of the principal components
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Identify the features with the highest influence on the first two principal components
top_features_pc1 = loadings_df['PC1'].abs().nlargest(5)
top_features_pc2 = loadings_df['PC2'].abs().nlargest(5)

print("\nTop features for PC1:")
print(top_features_pc1)
print("\nTop features for PC2:")
print(top_features_pc2)
Output:
Top features for PC1 (loadings):
Community_Support       0.498281
Peer_Influence          0.470234
Mental_Health           0.444141
Drug_Experimentation    0.355048
Family_Background       0.338889
6. Reconstruction:
o Attempt to reconstruct the original dataset from the retained principal components (e.g., the first two components). Assess the quality of the reconstruction and discuss any significant differences.
# Load the dataset
file_path = 'youthdrug.csv'  # Change this to your actual file path
youth_smoking_data = pd.read_csv(file_path)

# Standardize the numeric columns
numeric_data = youth_smoking_data.select_dtypes(include=[np.number])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numeric_data)

# Fit PCA and keep the first two principal components
pca = PCA(n_components=2)
principal_components = pca.fit_transform(standardized_data)

# Reconstruct the data using the first two principal components
reconstructed_data = pca.inverse_transform(principal_components)

# Convert the reconstructed data back to the original scale
reconstructed_data_original_scale = scaler.inverse_transform(reconstructed_data)
# Calculate reconstruction error (mean squared error for each feature)
original_df = pd.DataFrame(scaler.inverse_transform(standardized_data), columns=numeric_data.columns)
reconstructed_df = pd.DataFrame(reconstructed_data_original_scale, columns=numeric_data.columns)
reconstruction_error = np.mean((original_df - reconstructed_df) ** 2)
print("\nReconstruction error for each feature:")
print(reconstruction_error)

# Visualize the difference between original and reconstructed data for a few features
for i, column in enumerate(numeric_data.columns[:6], start=1):
    plt.subplot(2, 3, i)
    plt.plot(original_df[column], label='Original', alpha=0.7)
    plt.plot(reconstructed_df[column], label='Reconstructed', alpha=0.7)
    plt.title(column)
    plt.legend()
plt.tight_layout()
plt.show()
Output:
7. Interpretation and Insights:
o Based on your PCA results, summarize key insights about the relationships between smoking and drug use among youth. What factors seem to be most influential?
o Experiment with different numbers of principal components and observe how the patterns change.
o Discuss the potential implications of your findings for public health initiatives aimed at reducing youth smoking and drug use.
PCA 3
PCA on the Iris Dataset
1. Dataset: The Iris dataset contains features of different types of flowers (e.g., sepal length, sepal width, petal length, petal width). Load the dataset and display its first few rows.
2. Standardization: Before applying PCA, standardize the dataset. Explain why this step is necessary.
Code:
import csv

dataset = []
means = []
standardized_data = []
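Most of this standardization script was lost, so the following is a minimal from-scratch sketch of z-score normalization, assuming an IRIS.csv file whose last column is the class label and whose other columns are numeric:

import csv

# Load the numeric feature columns (all but the last column)
dataset = []
with open('IRIS.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header
    for row in reader:
        dataset.append([float(v) for v in row[:-1]])

# Per-column means and standard deviations
columns = list(zip(*dataset))
means = [sum(col) / len(col) for col in columns]
stds = [(sum((x - m) ** 2 for x in col) / len(col)) ** 0.5 for col, m in zip(columns, means)]

# z-score: subtract the column mean and divide by the column standard deviation
standardized_data = [[(x - m) / s for x, m, s in zip(row, means, stds)] for row in dataset]
print(standardized_data[:3])

Standardization matters here because PCA is driven by variance, so unscaled features with larger ranges would otherwise dominate the principal components.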
3. PCA Implementation:
o Plot the explained variance ratio for each principal component. What does this
plot tell you about the importance of each component?
import csv

def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Convert all but the last column to float
            labels.append(row[-1])  # Store the last column as label
    return np.array(dataset), labels

# Sort the eigenvalues and their corresponding eigenvectors in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Explained variance ratio
explained_variance_ratio = sorted_eigenvalues / np.sum(sorted_eigenvalues)

print(data_pca_final[:5])
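The plot of the explained variance ratio itself is missing from the extracted code; a short sketch, assuming the explained_variance_ratio array computed above, is:

import matplotlib.pyplot as plt

components = range(1, len(explained_variance_ratio) + 1)
plt.bar(components, explained_variance_ratio)
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio')
plt.title('Explained Variance Ratio per Principal Component')
plt.xticks(list(components))
plt.show()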
4. Visualization: Create a scatter plot of the first two principal components. How do the different classes of flowers separate in the PCA space?
# The dataset loading, standardization and eigen-decomposition steps are the same as in part 3.

# Create handles for the legend
plt.show()
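Only the legend comment and plt.show() survive for this part; a sketch of the intended scatter plot, assuming data_pca_final (the data projected onto the first two components) and the labels list from the loading step, could be:

import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
for species in sorted(set(labels)):
    mask = np.array([lab == species for lab in labels])
    plt.scatter(data_pca_final[mask, 0], data_pca_final[mask, 1], label=species, alpha=0.7)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Iris data projected onto the first two principal components')
plt.legend()  # one handle per species
plt.show()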
5. Reconstruction: Attempt to reconstruct the original dataset from the first two
principal components. How does the reconstructed data compare to the original data?
import csv
import numpy as np

def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])  # Convert all but the last column to float
            labels.append(row[-1])  # Store the last column as label
    return np.array(dataset), labels

sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]

num_components = 2
unique_labels = list(set(labels))
plt.figure(figsize=(10, 7))
Reconstruction: The reconstruction is performed by taking the reduced data (the first two principal components), multiplying it by the transpose of the corresponding eigenvectors, scaling it back up by the original standard deviations, and finally adding the mean.
Comparison: The original and reconstructed data are compared side by side for the first five samples to visualize how well the PCA has captured the original data.
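A compact sketch of that reconstruction, assuming sorted_eigenvectors, mean, std_dev and num_components = 2 from the code above, and using data_reduced as a hypothetical name for the projected array:

import numpy as np

# Back-project from the 2-D PCA space to the original 4-D feature space
reconstructed_standardized = data_reduced @ sorted_eigenvectors[:, :num_components].T

# Undo the standardization: scale by the original std devs and add back the means
reconstructed_data = reconstructed_standardized * std_dev + mean

# Compare the first five samples
print("Original (first 5 samples):\n", data[:5])
print("Reconstructed (first 5 samples):\n", np.round(reconstructed_data[:5], 3))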
Linear Regression
1. Consider the following simple dataset containing two sets of variables: a dependent variable (y) and an independent variable (x) for performing prediction. Basically, you have to estimate the best-fit line for the given data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('LinearRegression.csv')  # Replace with the path to your file
x = data['X'].values
y = data['Y'].values
y = (y - np.mean(y)) / np.std(y)

# Gradient descent for the best-fit line y = m*x + c
def gradient_descent(x, y, m, c, learning_rate, iterations):
    n = len(y)
    mse_history = []
    for _ in range(iterations):
        y_pred = m * x + c
        d_m = (-2 / n) * np.sum(x * (y - y_pred))
        d_c = (-2 / n) * np.sum(y - y_pred)
        m -= learning_rate * d_m
        c -= learning_rate * d_c
        mse_history.append(np.mean((y - y_pred) ** 2))
    return m, c, mse_history

m, c = 0, 0
iterations = 1000
m, c, mse_history = gradient_descent(x, y, m, c, 0.01, iterations)  # learning rate of 0.01 assumed
plt.title('Data Points')
plt.legend()
plt.show()
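As a quick cross-check of the gradient-descent fit (a sketch assuming the x and y arrays above), the closed-form least-squares slope and intercept can be obtained with np.polyfit and should be close to the values found by gradient descent:

import numpy as np

# Closed-form least-squares fit of y = slope * x + intercept
slope, intercept = np.polyfit(x, y, deg=1)
print(f"Closed-form fit: y = {slope:.4f} * x + {intercept:.4f}")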
Consider the house-price dataset and take at least five sets of important features. Perform dimension reduction through PCA and then apply linear regression to estimate house prices.
import math

# Step 1: Load the dataset manually
def load_csv(filename):

y = [row[0] for row in rows]   # Target (house price)
row[i] = value_map[row[i]]     # Encode categorical feature values as numbers

# Standardize target variable (house prices)

# Power iteration to find the dominant eigenvector of a matrix
def power_iteration(A, num_simulations=1000):
    n = len(A)
    b_k = [1] * n
    for _ in range(num_simulations):
        b_k1 = [sum(A[i][j] * b_k[j] for j in range(n)) for i in range(n)]
        b_k1_norm = math.sqrt(sum(x ** 2 for x in b_k1))
        b_k = [x / b_k1_norm for x in b_k1]
    return b_k

# Step 6: Select top k components (PCA)
k = 5
sorted_indices = sorted(range(len(eigenvalues)), key=lambda i: eigenvalues[i], reverse=True)[:k]
top_eigenvectors = [eigenvectors[i] for i in sorted_indices]
print(f"Selected Top {k} Eigenvectors: {top_eigenvectors}")

# Step 7: Project data onto top k components

return np.linalg.solve(X_transpose_X, X_transpose_y)  # Solve using numpy
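Since the projection and regression steps are mostly missing from the extracted code, here is a compact numpy sketch of the intended pipeline under the assumption that X is the numeric feature matrix and y the price vector: standardize the features, project onto the top k eigenvectors of the covariance matrix, then fit a linear model with the normal equations.

import numpy as np

def pca_regression(X, y, k=5):
    # Standardize the features
    X_std = (X - X.mean(axis=0)) / X.std(axis=0)

    # Eigen-decomposition of the covariance matrix (eigh: for symmetric matrices)
    cov = np.cov(X_std.T)
    eigenvalues, eigenvectors = np.linalg.eigh(cov)

    # Keep the k components with the largest eigenvalues
    top = np.argsort(eigenvalues)[::-1][:k]
    Z = X_std @ eigenvectors[:, top]           # projected data, shape (n_samples, k)

    # Linear regression via the normal equations, with an intercept column
    Z1 = np.column_stack([np.ones(len(Z)), Z])
    beta = np.linalg.solve(Z1.T @ Z1, Z1.T @ y)
    return beta, Z1 @ beta                     # coefficients and fitted prices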
import pandas as pd

# Load Dataset (Assuming it's already in CSV format with headers)
def load_dataset(file_path):
    dataset = []
    labels = []
    with open(file_path, mode='r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read header if present
        for row in reader:
            dataset.append([float(value) for value in row[:-1]])
            labels.append(row[-1])
    return np.array(dataset), labels

# Load the dataset
file_path = 'IRIS.csv'  # Replace with the actual path to your dataset
data, labels = load_dataset(file_path)

# Standardize the dataset
mean = np.mean(data, axis=0)
std_dev = np.std(data, axis=0)
data_standardized = (data - mean) / std_dev

plt.show()
# Choosing the number of components to retain (first 2 components for the scatter plot)
plt.figure(figsize=(10, 7))
scatter = plt.scatter(data_pca_final[:, 0], data_pca_final[:, 1], c=colors, cmap='viridis', alpha=0.7)
plt.show()

# Reconstruction from the first two principal components

# Display the original and reconstructed data
print("Original vs Reconstructed Data (First 5 Samples):")
print(comparison)

# Calculate and display component loadings
loadings = sorted_eigenvectors[:, :num_components] * np.sqrt(sorted_eigenvalues[:num_components])
loadings_df = pd.DataFrame(loadings,
                           columns=[f'PC{i+1}' for i in range(num_components)],
                           index=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
print("\nComponent Loadings:")
print(loadings_df)
7. Explore how the results change if you decide to keep more principal components (e.g., three or four) instead of two.
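One way to explore this is to repeat the projection and reconstruction for k = 1..4 components and compare the mean squared reconstruction error; the sketch below assumes data_standardized and the sorted eigenvectors from the Iris PCA above.

import numpy as np

for k in range(1, 5):
    V_k = sorted_eigenvectors[:, :k]
    reduced = data_standardized @ V_k      # project down to k dimensions
    reconstructed = reduced @ V_k.T        # back-project to the original 4 dimensions
    mse = np.mean((data_standardized - reconstructed) ** 2)
    print(f"k = {k}: mean squared reconstruction error = {mse:.4f}")

The error should shrink as k grows and reach essentially zero once all four components are kept.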