BABU BANARASI DAS UNIVERSITY
LUCKNOW, U.P.
Lab Manual
on
Data Science with Python
(BAI3552)
For
B.Tech 3rd Year (Sem. 5th)
Session (2024-25)
B.Tech CS33 and CS34
Babu Banarasi Das University
Subject: DS Lab (BAI3552)    Program: B.Tech. CSAI III Year (Sem-V)
INDEX

Sr. No.  Title
1.  Implementation of a program for reading different types of data sets (.txt, .csv) from the web and disk, and writing to a file in a specific disk location.
2.  Implementation of a program for reading EXCEL and XML data sheets.
3.  Implementation of Basic Statistics Functions and Data Visualization.
4.  Implementation of K-means Clustering and K-Nearest Neighbor Algorithm.
5.  Implementation of Association Rules.
6.  Implementation of Linear Regression and Logistic Regression.
7.  Implementation of Naive Bayesian Classifier.
8.  Implementation of Decision Trees.
9.  Implementation of Random Forest.
10. Implementation of Principal Component Analysis.
11. Implementation of Singular Value Decomposition.
Program No. : 1
Implementation of a program for reading different types of data sets (.txt, .csv) from the web and disk, and writing to a file in a specific disk location.
#Step 1: Import Necessary Libraries
import os
import requests
import pandas as pd

#Step 2: Define Functions to Handle Different File Types
def read_txt_from_disk(file_path):
    with open(file_path, 'r') as file:
        data = file.read()
    return data

def read_csv_from_disk(file_path):
    df = pd.read_csv(file_path)
    return df

def read_txt_from_web(url):
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a valid response
    return response.text

def read_csv_from_web(url):
    df = pd.read_csv(url)
    return df

#Step 3: Write Data to a Specific Disk Location
def write_txt_to_disk(data, output_path):
    with open(output_path, 'w') as file:
        file.write(data)

def write_csv_to_disk(df, output_path):
    df.to_csv(output_path, index=False)

#Step 4: Implement a Main Function to Combine All Operations
def process_file(file_type, source, destination):
    # Make sure the destination directory exists before writing
    os.makedirs(os.path.dirname(destination) or '.', exist_ok=True)
    # Read the data based on file type and source type
    if file_type == 'txt':
        if source.startswith('http'):
            data = read_txt_from_web(source)
        else:
            data = read_txt_from_disk(source)
        # Write the data to the destination
        write_txt_to_disk(data, destination)
    elif file_type == 'csv':
        if source.startswith('http'):
            df = read_csv_from_web(source)
        else:
            df = read_csv_from_disk(source)
        # Write the DataFrame to the destination
        write_csv_to_disk(df, destination)
    else:
        print(f"Unsupported file type: {file_type}")

#Step 5: Example Usage
if __name__ == "__main__":
    # Example for reading a .txt file from the web and saving it locally
    txt_url = "https://example.com/sample.txt"
    process_file('txt', txt_url, "output/sample.txt")

    # Example for reading a .csv file from the disk and saving it to another location
    csv_file_path = "data/sample.csv"
    process_file('csv', csv_file_path, "output/sample_output.csv")
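
Some web servers reject the plain request that pd.read_csv(url) sends (for example, they may require a User-Agent header). In that case the bytes can be downloaded first and parsed from memory. Below is a minimal sketch of that variant; the URL and header values are illustrative placeholders, not part of the program above.

# Variant: download a CSV with custom headers, then parse it from memory
import io
import requests
import pandas as pd

def read_csv_from_web_with_headers(url, headers=None):
    response = requests.get(url, headers=headers)
    response.raise_for_status()                     # stop on HTTP errors
    return pd.read_csv(io.StringIO(response.text))  # parse the body in memory

# Example usage (hypothetical URL):
# df = read_csv_from_web_with_headers("https://example.com/data.csv",
#                                     headers={"User-Agent": "lab-manual-demo"})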
Program No. : 2
Implementation of a program for reading EXCEL and XML data sheets.
#Step 1: Import Necessary Libraries
import pandas as pd
import xml.etree.ElementTree as ET

#Step 2: Define Functions to Handle Different File Types
def read_excel_file(file_path):
    df = pd.read_excel(file_path)  # reading .xlsx files requires the openpyxl package
    return df

#Function to Read an XML File
def read_xml_file(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    return root

#Step 3: Write Data to a Specific Disk Location
def write_excel_to_disk(df, output_path):
    df.to_excel(output_path, index=False)

#Function to Write XML Data to Disk
def write_xml_to_disk(root, output_path):
    tree = ET.ElementTree(root)
    tree.write(output_path, encoding="utf-8", xml_declaration=True)

#Step 4: Implement a Main Function to Combine All Operations
def process_file(file_type, source, destination):
    # Read the data based on file type and source type
    if file_type == 'excel':
        df = read_excel_file(source)
        # Write the DataFrame to the destination
        write_excel_to_disk(df, destination)
    elif file_type == 'xml':
        root = read_xml_file(source)
        # Write the XML tree to the destination
        write_xml_to_disk(root, destination)
    else:
        print(f"Unsupported file type: {file_type}")

#Step 5: Example Usage
if __name__ == "__main__":
    # Example for reading an Excel file and saving it locally
    excel_file_path = "data/sample.xlsx"
    process_file('excel', excel_file_path, "output/sample_output.xlsx")

    # Example for reading an XML file and saving it locally
    xml_file_path = "data/sample.xml"
    process_file('xml', xml_file_path, "output/sample_output.xml")
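
For flat, record-style XML (a repeated child element with the same tag for each row), pandas can also load the data straight into a DataFrame instead of an ElementTree. A minimal sketch, assuming pandas 1.3 or newer; "data/sample.xml" is the same placeholder path used above.

import pandas as pd

# pd.read_xml flattens repeated elements into rows; parser="etree" uses the
# standard library so no lxml install is needed
df = pd.read_xml("data/sample.xml", parser="etree")
print(df.head())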
Program No. : 3
Implementation of Basic Statistics Functions and Data Visualization.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Implement Basic Statistics Functions
#1. Mean
def calculate_mean(data):
    return np.mean(data)

#2. Median
def calculate_median(data):
    return np.median(data)

#3. Mode
def calculate_mode(data):
    # keepdims=False returns scalar results (SciPy >= 1.9)
    mode_result = stats.mode(data, keepdims=False)
    return mode_result.mode

#4. Standard Deviation
def calculate_std_dev(data):
    return np.std(data)  # population standard deviation (ddof=0)

#5. Variance
def calculate_variance(data):
    return np.var(data)  # population variance (ddof=0)

#6. Percentiles
def calculate_percentile(data, percentile):
    return np.percentile(data, percentile)

#Step 3: Perform Data Visualization
#1. Histogram
def plot_histogram(data, title='Histogram', xlabel='Values', ylabel='Frequency'):
    plt.figure(figsize=(10, 6))
    plt.hist(data, bins=30, color='skyblue', edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

#2. Box Plot
def plot_boxplot(data, title='Boxplot'):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=data)
    plt.title(title)
    plt.show()

#3. Scatter Plot
def plot_scatter(x, y, title='Scatter Plot', xlabel='X', ylabel='Y'):
    plt.figure(figsize=(10, 6))
    plt.scatter(x, y, color='purple')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

#4. Line Plot
def plot_line(x, y, title='Line Plot', xlabel='X', ylabel='Y'):
    plt.figure(figsize=(10, 6))
    plt.plot(x, y, color='green')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()

#Step 4: Example Usage
if __name__ == "__main__":
    # Example data
    data = np.random.normal(loc=0, scale=1, size=1000)

    # Calculate basic statistics
    mean = calculate_mean(data)
    median = calculate_median(data)
    mode = calculate_mode(data)
    std_dev = calculate_std_dev(data)
    variance = calculate_variance(data)
    percentile_25 = calculate_percentile(data, 25)
    percentile_75 = calculate_percentile(data, 75)

    # Print calculated statistics
    print(f"Mean: {mean}")
    print(f"Median: {median}")
    print(f"Mode: {mode}")
    print(f"Standard Deviation: {std_dev}")
    print(f"Variance: {variance}")
    print(f"25th Percentile: {percentile_25}")
    print(f"75th Percentile: {percentile_75}")

    # Visualize data
    plot_histogram(data)
    plot_boxplot(data)

    # Example scatter plot with another dataset
    x = np.random.normal(loc=0, scale=1, size=100)
    y = x * 2 + np.random.normal(loc=0, scale=1, size=100)
    plot_scatter(x, y)

    # Example line plot (sort by x so the line does not zigzag)
    order = np.argsort(x)
    plot_line(x[order], y[order])
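
Note that np.std and np.var above compute the population statistics (they divide by n, i.e. ddof=0). When the data are a sample drawn from a larger population, the unbiased sample versions divide by n - 1 instead. A minimal sketch of the difference:

import numpy as np

data = np.random.normal(loc=0, scale=1, size=1000)
print(np.std(data, ddof=0))  # population standard deviation (divide by n)
print(np.std(data, ddof=1))  # sample standard deviation (divide by n - 1)
print(np.var(data, ddof=1))  # sample variance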
Program No. : 4
Implementation of K-means Clustering and K-Nearest Neighbor Algorithm.
#Step 1: Import Necessary Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#Step 2: K-means Clustering Implementation
#1. Generate Sample Data
def generate_data_for_clustering(n_samples=300, n_features=2, centers=4, cluster_std=1.0):
    X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers,
                      cluster_std=cluster_std, random_state=42)
    return X, y

#2. Implement K-means Clustering
def kmeans_clustering(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    return kmeans

#3. Visualize K-means Clustering
def plot_kmeans_clusters(X, kmeans):
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap='viridis', marker='o')
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                s=300, c='red', marker='x')
    plt.title("K-means Clustering")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

#Step 3: K-Nearest Neighbor (KNN) Implementation
#1. Generate Sample Data for KNN
def generate_data_for_knn(n_samples=300, n_features=2, centers=4, cluster_std=1.0):
    X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers,
                      cluster_std=cluster_std, random_state=42)
    return X, y

#2. Implement K-Nearest Neighbor Classifier
def knn_classifier(X_train, y_train, X_test, n_neighbors=5):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    return y_pred, knn

#3. Evaluate KNN Classifier
def evaluate_knn(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    return accuracy, cm, cr

#Step 4: Visualize KNN Decision Boundary
def plot_knn_decision_boundary(X, y, knn, h=0.02):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', marker='o', cmap='viridis')
    plt.title("K-Nearest Neighbor Decision Boundary")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

#Step 5: Example Usage
if __name__ == "__main__":
    # K-means Clustering Example
    X, y = generate_data_for_clustering(n_samples=300, n_features=2, centers=4, cluster_std=1.0)
    kmeans = kmeans_clustering(X, n_clusters=4)
    plot_kmeans_clusters(X, kmeans)

    # K-Nearest Neighbor Example
    X, y = generate_data_for_knn(n_samples=300, n_features=2, centers=4, cluster_std=1.0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    y_pred, knn = knn_classifier(X_train, y_train, X_test, n_neighbors=5)
    accuracy, cm, cr = evaluate_knn(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(cr)

    # Visualize the KNN decision boundary
    plot_knn_decision_boundary(X_train, y_train, knn)
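
The example above fixes n_clusters=4 because the data were generated with four centres. When the true number of clusters is unknown, a common heuristic is the elbow method: plot the within-cluster sum of squared distances (KMeans's inertia_ attribute) against k and look for the bend. A minimal sketch, reusing generate_data_for_clustering from above:

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X, _ = generate_data_for_clustering()
ks = range(1, 10)
inertias = []
for k in ks:
    km = KMeans(n_clusters=k, random_state=42).fit(X)
    inertias.append(km.inertia_)  # within-cluster sum of squared distances

plt.plot(list(ks), inertias, marker='o')
plt.title('Elbow Method for Choosing k')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()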
Program No. : 5
Implementation of Association Rules.
#Step 1: Install Necessary Libraries
# Run once from the command line: pip install mlxtend

#Step 2: Import Necessary Libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

#Step 3: Prepare the Dataset
def load_dataset():
    data = {'Milk':    [1, 0, 1, 1, 0],
            'Bread':   [1, 1, 0, 1, 1],
            'Butter':  [0, 1, 1, 1, 0],
            'Beer':    [1, 0, 0, 1, 1],
            'Diapers': [0, 1, 1, 1, 0]}
    # apriori expects one-hot encoded (boolean) transaction data
    df = pd.DataFrame(data).astype(bool)
    return df

#Step 4: Generate Frequent Itemsets
def generate_frequent_itemsets(df, min_support=0.6):
    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
    return frequent_itemsets

#Step 5: Generate Association Rules
def generate_association_rules(frequent_itemsets, metric="lift", min_threshold=1.0):
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)
    return rules

#Step 6: Example Usage
if __name__ == "__main__":
    # Load dataset
    df = load_dataset()

    # Generate frequent itemsets with a minimum support of 0.6
    frequent_itemsets = generate_frequent_itemsets(df, min_support=0.6)
    print("Frequent Itemsets:")
    print(frequent_itemsets)

    # Generate association rules with a minimum lift of 1.0
    rules = generate_association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    print("\nAssociation Rules:")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
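
The rule metrics printed above have simple definitions: support(A→B) is the fraction of transactions containing both A and B, confidence(A→B) is support(A and B) divided by support(A), and lift is the confidence divided by support(B), so lift = 1 means the items are independent. A short sketch that verifies them by hand for the rule Milk → Bread on the same five transactions:

import pandas as pd

df = pd.DataFrame({'Milk':  [1, 0, 1, 1, 0],
                   'Bread': [1, 1, 0, 1, 1]}).astype(bool)

n = len(df)
support_both  = (df['Milk'] & df['Bread']).sum() / n  # P(Milk and Bread)
support_milk  = df['Milk'].sum() / n                  # P(Milk)
support_bread = df['Bread'].sum() / n                 # P(Bread)

confidence = support_both / support_milk              # P(Bread | Milk)
lift = confidence / support_bread                     # 1.0 would mean independence

print(f"support={support_both}, confidence={confidence:.3f}, lift={lift:.3f}")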
Program No. : 6
Implementation of Linear Regression and Logistic Regression.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score,
                             confusion_matrix, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Linear Regression Implementation
#1. Generate or Load Dataset
def generate_linear_data(n_samples=100, noise=10):
    np.random.seed(42)
    X = 2 * np.random.rand(n_samples, 1)
    y = 4 + 3 * X + np.random.randn(n_samples, 1) * noise
    return X, y

X, y = generate_linear_data()

#2. Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#3. Train the Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

#4. Predict and Evaluate the Model
y_pred = linear_model.predict(X_test)

# Calculate Mean Squared Error and R2 Score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

#5. Visualize the Results
plt.figure(figsize=(10, 6))
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
plt.title('Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

#Step 3: Logistic Regression Implementation
#1. Generate or Load Dataset
from sklearn.datasets import make_classification

def generate_logistic_data(n_samples=100, n_features=2, n_classes=2):
    # n_redundant=0 so that two informative features fit into n_features=2
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, n_informative=2,
                               n_redundant=0, random_state=42)
    return X, y

X, y = generate_logistic_data()

#2. Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#3. Train the Logistic Regression Model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

#4. Predict and Evaluate the Model
y_pred = logistic_model.predict(X_test)

# Calculate Accuracy and Confusion Matrix
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)

#5. Visualize the Decision Boundary
def plot_decision_boundary(X, y, model):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', cmap='viridis')
    plt.title("Logistic Regression Decision Boundary")
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

plot_decision_boundary(X_test, y_test, logistic_model)
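
Because the linear data were generated as y = 4 + 3x plus noise, it is instructive to compare the fitted parameters against those true values; with noise as large as 10 the estimates can be well off the mark. A short sketch, assuming the linear_model fitted above is still in scope:

# Compare fitted parameters to the generating values (intercept 4, slope 3);
# y was passed as a 2-D column, hence the [0] indexing
print(f"Learned intercept: {linear_model.intercept_[0]:.3f} (true value: 4)")
print(f"Learned slope:     {linear_model.coef_[0][0]:.3f} (true value: 3)")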
Program No. : 7
Implementation of Naive Bayesian Classifier.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Generate or Load Dataset
from sklearn.datasets import make_classification

def generate_naive_bayes_data(n_samples=100, n_features=2, n_classes=2):
    # n_redundant=0 so that two informative features fit into n_features=2
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, n_informative=2,
                               n_redundant=0, random_state=42)
    return X, y

X, y = generate_naive_bayes_data()

#Step 3: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Step 4: Train the Naive Bayesian Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

#Step 5: Predict and Evaluate the Model
y_pred = nb_classifier.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate Confusion Matrix and Classification Report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(cr)

#Step 6: Visualize the Decision Boundary
def plot_decision_boundary(X, y, model):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', cmap='viridis')
    plt.title("Naive Bayesian Classifier Decision Boundary")
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

plot_decision_boundary(X_test, y_test, nb_classifier)
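
GaussianNB is a probabilistic model: it applies Bayes' theorem with a Gaussian likelihood for each feature and class, so it can report the posterior probability of each class rather than only a hard label. A short sketch, assuming nb_classifier and X_test from above:

# Posterior class probabilities for the first five test points
probs = nb_classifier.predict_proba(X_test[:5])
for point, p in zip(X_test[:5], probs):
    print(f"x={point}, P(class 0)={p[0]:.3f}, P(class 1)={p[1]:.3f}")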
Program No. : 8
Implementation of Decision Trees.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             mean_squared_error, r2_score)
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Generate or Load Dataset
#1. Classification Dataset
from sklearn.datasets import make_classification

def generate_classification_data(n_samples=100, n_features=4, n_classes=2):
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, random_state=42)
    return X, y

X_class, y_class = generate_classification_data()

#2. Regression Dataset
from sklearn.datasets import make_regression

def generate_regression_data(n_samples=100, n_features=1, noise=0.1):
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           noise=noise, random_state=42)
    return X, y

X_reg, y_reg = generate_regression_data()

#Step 3: Split the Data
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42)

#Step 4: Train the Decision Tree Models
#1. Decision Tree for Classification
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_class, y_train_class)

#2. Decision Tree for Regression
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_reg, y_train_reg)

#Step 5: Predict and Evaluate the Models
#1. Classification Model Evaluation
y_pred_class = dt_classifier.predict(X_test_class)

# Accuracy and Confusion Matrix
accuracy_class = accuracy_score(y_test_class, y_pred_class)
cm_class = confusion_matrix(y_test_class, y_pred_class)
cr_class = classification_report(y_test_class, y_pred_class)
print(f"Classification Accuracy: {accuracy_class}")
print("Confusion Matrix:")
print(cm_class)
print("Classification Report:")
print(cr_class)

#2. Regression Model Evaluation
y_pred_reg = dt_regressor.predict(X_test_reg)

# Mean Squared Error and R2 Score
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)
print(f"Regression Mean Squared Error: {mse_reg}")
print(f"Regression R2 Score: {r2_reg}")

#Step 6: Visualize the Decision Trees
#1. Plot the Decision Tree for Classification
plt.figure(figsize=(20, 10))
plot_tree(dt_classifier, filled=True,
          feature_names=[f'Feature {i}' for i in range(X_class.shape[1])],
          class_names=['Class 0', 'Class 1'])
plt.title("Decision Tree - Classification")
plt.show()

#2. Plot the Decision Tree for Regression
plt.figure(figsize=(20, 10))
plot_tree(dt_regressor, filled=True,
          feature_names=[f'Feature {i}' for i in range(X_reg.shape[1])])
plt.title("Decision Tree - Regression")
plt.show()

#Step 7: Visualize the Predictions (for Regression)
plt.figure(figsize=(10, 6))
plt.scatter(X_test_reg, y_test_reg, color='blue', label='Actual')
plt.scatter(X_test_reg, y_pred_reg, color='red', label='Predicted')
plt.title('Decision Tree Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()
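
An unconstrained decision tree, as trained above, keeps splitting until its leaves are pure, which often memorises the training data. Limiting the tree depth is the simplest guard against this overfitting. A minimal sketch, reusing the classification split from above:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Compare an unpruned tree against depth-limited trees; a large gap between
# train and test accuracy is a sign of overfitting
for depth in [None, 2, 3, 5]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train_class, y_train_class)
    train_acc = accuracy_score(y_train_class, dt.predict(X_train_class))
    test_acc = accuracy_score(y_test_class, dt.predict(X_test_class))
    print(f"max_depth={depth}: train accuracy={train_acc:.3f}, test accuracy={test_acc:.3f}")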
Program No. : 9
Implementation of Random Forest.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             mean_squared_error, r2_score)
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Generate or Load Dataset
#1. Classification Dataset
from sklearn.datasets import make_classification

def generate_classification_data(n_samples=100, n_features=4, n_classes=2):
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=n_classes, random_state=42)
    return X, y

X_class, y_class = generate_classification_data()

#2. Regression Dataset
from sklearn.datasets import make_regression

def generate_regression_data(n_samples=100, n_features=1, noise=0.1):
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           noise=noise, random_state=42)
    return X, y

X_reg, y_reg = generate_regression_data()

#Step 3: Split the Data
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.3, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42)

#Step 4: Train the Random Forest Models
#1. Random Forest for Classification
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_class, y_train_class)

#2. Random Forest for Regression
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_reg, y_train_reg)

#Step 5: Predict and Evaluate the Models
#1. Classification Model Evaluation
y_pred_class = rf_classifier.predict(X_test_class)

# Accuracy and Confusion Matrix
accuracy_class = accuracy_score(y_test_class, y_pred_class)
cm_class = confusion_matrix(y_test_class, y_pred_class)
cr_class = classification_report(y_test_class, y_pred_class)
print(f"Classification Accuracy: {accuracy_class}")
print("Confusion Matrix:")
print(cm_class)
print("Classification Report:")
print(cr_class)

#2. Regression Model Evaluation
y_pred_reg = rf_regressor.predict(X_test_reg)

# Mean Squared Error and R2 Score
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)
print(f"Regression Mean Squared Error: {mse_reg}")
print(f"Regression R2 Score: {r2_reg}")

#Step 6: Feature Importance
#1. Feature Importance for Classification
feature_importance_class = rf_classifier.feature_importances_
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance_class, y=[f'Feature {i}' for i in range(X_class.shape[1])])
plt.title("Feature Importance - Classification")
plt.show()

#2. Feature Importance for Regression
feature_importance_reg = rf_regressor.feature_importances_
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance_reg, y=[f'Feature {i}' for i in range(X_reg.shape[1])])
plt.title("Feature Importance - Regression")
plt.show()
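
Because each tree in a random forest is trained on a bootstrap sample, roughly one third of the rows are left out of any given tree; scoring each row with only the trees that never saw it gives a built-in validation estimate without a separate hold-out set. A minimal sketch using scikit-learn's oob_score option on the same classification split:

from sklearn.ensemble import RandomForestClassifier

# oob_score=True computes an out-of-bag accuracy estimate during training
rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf_oob.fit(X_train_class, y_train_class)
print(f"Out-of-bag accuracy estimate: {rf_oob.oob_score_:.3f}")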
Program No. : 10
Implementation of Principal Component Analysis.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

#Step 2: Load or Generate Dataset
from sklearn.datasets import load_iris

def load_iris_data():
    iris = load_iris()
    X = iris.data
    y = iris.target
    feature_names = iris.feature_names
    return X, y, feature_names

X, y, feature_names = load_iris_data()

#Step 3: Standardize the Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Step 4: Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

#Step 5: Analyze the Explained Variance
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance Ratio: {explained_variance}")
print(f"Total Variance Explained by the first 2 components: {np.sum(explained_variance)}")

#Step 6: Visualize the PCA Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis', s=100)
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

#Step 7: (Optional) Cumulative Explained Variance Plot
pca_full = PCA().fit(X_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance by Number of Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()
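
Each principal component is a linear combination of the original (standardized) features, and the weights live in pca.components_. Inspecting them shows which iris measurements drive each component. A short sketch, assuming pca and feature_names from above are still in scope:

import pandas as pd

# Rows are components, columns are the original iris features
loadings = pd.DataFrame(pca.components_,
                        columns=feature_names,
                        index=['PC1', 'PC2'])
print(loadings)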
Program No. : 11
Implementation of Singular Value Decomposition.
#Step 1: Import Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.linalg import svd

#Step 2: Generate or Load a Dataset
def create_sample_matrix():
    return np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12]
    ])

matrix = create_sample_matrix()
print("Original Matrix:")
print(matrix)

#Step 3: Apply Singular Value Decomposition
# full_matrices=False gives the reduced SVD (U is 4x3 here), so the shapes
# match when the matrix is reconstructed below
U, Sigma, VT = svd(matrix, full_matrices=False)
print("U Matrix:")
print(U)
print("Sigma Values:")
print(Sigma)
print("V^T Matrix:")
print(VT)

#Step 4: Reconstruct the Original Matrix
Sigma_matrix = np.diag(Sigma)
reconstructed_matrix = np.dot(U, np.dot(Sigma_matrix, VT))
print("Reconstructed Matrix:")
print(reconstructed_matrix)

#Step 5: Truncated SVD for Dimensionality Reduction
k = 2  # Number of singular values to keep
U_k = U[:, :k]
Sigma_k = np.diag(Sigma[:k])
VT_k = VT[:k, :]
reduced_matrix = np.dot(U_k, np.dot(Sigma_k, VT_k))
print(f"Reduced Matrix using {k} singular values:")
print(reduced_matrix)

#Step 6: Visualize the Singular Values
plt.figure(figsize=(8, 5))
plt.plot(Sigma, marker='o')
plt.title('Singular Values')
plt.xlabel('Index')
plt.ylabel('Singular Value')
plt.grid(True)
plt.show()
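
By the Eckart-Young theorem, the rank-k truncation above is the best rank-k approximation of the matrix in the Frobenius norm, and its error equals the square root of the sum of the squared discarded singular values. A short sketch that checks this numerically, assuming matrix, reduced_matrix, Sigma and k from above; note the sample matrix has rank 2, so the k = 2 truncation reproduces it almost exactly.

# Frobenius-norm error of the rank-k approximation
error = np.linalg.norm(matrix - reduced_matrix)
expected = np.sqrt(np.sum(Sigma[k:] ** 2))  # discarded singular values
print(f"Reconstruction error: {error:.6f}")
print(f"Expected from discarded singular values: {expected:.6f}")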
By: Ms. Shailja Chaurasiya (Asst. Prof., Deptt. of CSE)