# Title
# No category Today 8:07 PM
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
# Manual PCA on the Iris dataset: standardize, eigen-decompose the
# covariance matrix, keep enough components to reach the variance target,
# and report how strongly each original feature loads on each component.

# Load the Iris dataset
iris = datasets.load_iris()
data, columns = iris.data, iris.feature_names

# Create a Pandas DataFrame from the Iris dataset
df = pd.DataFrame(data, columns=columns)

# Step 1: Mean normalize the features (subtract the column mean, divide by
# the column sample standard deviation)
normalized_data = (df - df.mean()) / df.std()

# Step 2: Find the covariance matrix (columns are the variables)
covariance_matrix = np.cov(normalized_data, rowvar=False)

# Step 3: Find eigenvalues and eigenvectors of the covariance matrix.
# A covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and orthonormal eigenvectors, whereas the general-purpose
# eig may return complex values contaminated by round-off.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

# Step 4: Arrange eigenvalues in descending order (eigenvectors are the
# columns of the matrix, so reorder columns with the same permutation)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Step 5: Select eigenvalues that retain the required variance
total_variance = np.sum(sorted_eigenvalues)
variance_to_retain = 0.95
cumulative_variance = np.cumsum(sorted_eigenvalues) / total_variance
# argmax on a boolean array gives the first index where the threshold is
# met; +1 converts that index into a component count.
num_components_to_retain = (
    np.argmax(cumulative_variance >= variance_to_retain) + 1
)
selected_eigenvalues = sorted_eigenvalues[:num_components_to_retain]
selected_eigenvectors = sorted_eigenvectors[:, :num_components_to_retain]

# Step 6: Transform original data using eigen vectors corresponding to
# selected eigenvalues
transformed_data = np.dot(normalized_data, selected_eigenvectors)

# Print the results
print(
    f"Number of components to retain {variance_to_retain * 100}% "
    f"variance: {num_components_to_retain}"
)
print("Explained variance ratio:", selected_eigenvalues / total_variance)

# Interpret which features influenced the principal components the most:
# absolute loadings, scaled so each component's column sums to 1.
feature_contributions = (
    np.abs(selected_eigenvectors)
    / np.sum(np.abs(selected_eigenvectors), axis=0)
)
feature_contributions_df = pd.DataFrame(
    feature_contributions,
    index=columns,
    columns=[f'PC{i + 1}' for i in range(num_components_to_retain)],
)
print("\nFeature contributions to Principal Components:")
print(feature_contributions_df)
# Code 02
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
def load_iris_data():
    """Load the Iris dataset.

    Returns:
        A ``(features, target)`` tuple: ``features`` is a DataFrame built
        from ``iris.data`` with ``iris.feature_names`` as column labels,
        and ``target`` is ``iris.target``.
    """
    iris = datasets.load_iris()
    data, columns = iris.data, iris.feature_names
    return pd.DataFrame(data, columns=columns), iris.target
def display_correlation_matrix(df):
    """Print the pairwise correlation matrix of the columns of *df*.

    Args:
        df: DataFrame whose ``corr()`` result is printed.
    """
    correlation_matrix = df.corr()
    print("Correlation Matrix:")
    print(correlation_matrix)
def normalize_features(df):
    """Standardize every column of *df* to zero mean and unit spread.

    Each column has its mean subtracted and is divided by its standard
    deviation as computed by ``df.std()``.

    Args:
        df: DataFrame of numeric features.

    Returns:
        A DataFrame with the same index and columns as *df*.
    """
    spread = df.std()
    # A constant column has zero spread; dividing by it would turn the
    # whole column into NaN and break any PCA run on the result. Divide
    # by 1 instead so such a column simply becomes all zeros.
    spread = spread.replace(0, 1)
    return (df - df.mean()) / spread
def perform_pca(data, target):
    """Fit PCA on *data*, then plot and report the result.

    Args:
        data: DataFrame of features; its ``columns`` are used as the
            feature labels in the contribution report.
        target: Per-sample values passed to the scatter plots as colors.
    """
    # A float n_components asks scikit-learn to keep as many components
    # as needed to explain that fraction of the variance.
    pca = PCA(n_components=0.95)
    transformed_data = pca.fit_transform(data)
    plot_before_after_pca(data, transformed_data, target)
    display_pca_info(pca)
    display_feature_contributions(pca.components_, data.columns)
def plot_before_after_pca(original_data, transformed_data, target):
    """Draw side-by-side scatter plots of the data before and after PCA.

    The left panel plots the first two columns of *original_data*; the
    right panel plots the first two columns of *transformed_data*. The
    figure is saved to ``output_plot.png`` and then shown.

    Args:
        original_data: DataFrame with at least two columns (accessed via
            ``iloc``).
        transformed_data: 2-D array with at least two columns.
        target: Per-sample values used to color the points in both panels.
    """
    plt.figure(figsize=(12, 6))

    # Original Data
    plt.subplot(1, 2, 1)
    plt.scatter(
        original_data.iloc[:, 0],
        original_data.iloc[:, 1],
        c=target,
        cmap='Set1',
    )
    plt.title('Original Data')
    plt.xlabel('Feature 01')
    plt.ylabel('Feature 02')

    # Data after PCA
    plt.subplot(1, 2, 2)
    plt.scatter(
        transformed_data[:, 0],
        transformed_data[:, 1],
        c=target,
        cmap='Set1',
    )
    plt.title('Data after PCA')
    plt.xlabel('Principal Component 01')
    plt.ylabel('Principal Component 02')

    # Save before show(): the original does so too, so the file is
    # written regardless of what the interactive window does afterwards.
    plt.savefig('output_plot.png')
    plt.show()
def display_pca_info(pca):
    """Print the component count and explained-variance ratios of *pca*.

    Args:
        pca: Fitted object exposing ``n_components_`` and
            ``explained_variance_ratio_``.
    """
    # NOTE: the "95%" in the message is fixed text; it is not read from
    # the fitted object.
    print(
        "\nNumber of components to retain 95% variance: "
        f"{pca.n_components_}"
    )
    print("Explained variance ratio:", pca.explained_variance_ratio_)
def display_feature_contributions(components, columns):
    """Print and plot each feature's share of every principal component.

    The share is the absolute loading divided by the sum of absolute
    loadings of that component, so the shares of one component sum to 1.
    One bar chart is drawn per component; the figure is saved to
    ``output_plot_feature_contributions.png`` and then shown.

    Args:
        components: 2-D array with one row per component and one column
            per feature.
        columns: Feature labels, one per column of *components*.
    """
    feature_contributions = (
        np.abs(components)
        / np.sum(np.abs(components), axis=1)[:, np.newaxis]
    )
    # Transpose so rows are features and columns are components.
    feature_contributions_df = pd.DataFrame(
        feature_contributions.T,
        index=columns,
        columns=[f'PC{i + 1}' for i in range(components.shape[0])],
    )
    print("\nFeature contributions to Principal Components:")
    print(feature_contributions_df)

    # Plotting the feature contributions
    plt.figure(figsize=(12, 6))
    for i in range(components.shape[0]):
        plt.subplot(1, components.shape[0], i + 1)
        plt.bar(columns, feature_contributions_df.iloc[:, i])
        plt.title(f'PC{i + 1} Feature Contributions')
        plt.xlabel('Original Features')
        plt.ylabel('Contribution')
    plt.savefig('output_plot_feature_contributions.png')
    plt.show()
def main():
    """Run the pipeline: load Iris, show correlations, normalize, run PCA."""
    iris_data, target = load_iris_data()
    display_correlation_matrix(iris_data)
    normalized_data = normalize_features(iris_data)
    perform_pca(normalized_data, target)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Code 03
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
# PCA on the Iris dataset with scikit-learn: print the correlation matrix,
# standardize, fit PCA, plot before/after, and report how strongly each
# original feature loads on each retained component.

# Load the Iris dataset
iris = datasets.load_iris()
data, columns = iris.data, iris.feature_names

# Create a Pandas DataFrame from the Iris dataset
df = pd.DataFrame(data, columns=columns)

# Display the correlation matrix
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Mean normalize the features
normalized_data = (df - df.mean()) / df.std()

# Perform PCA using Scikit-learn
pca = PCA(n_components=0.95)  # Retain 95% variance
transformed_data = pca.fit_transform(normalized_data)

# Plotting the data before and after PCA
plt.figure(figsize=(12, 6))

# Original Data
plt.subplot(1, 2, 1)
plt.scatter(
    normalized_data.iloc[:, 0],
    normalized_data.iloc[:, 1],
    c=iris.target,
    cmap='Set1',
)
plt.title('Original Data')
plt.xlabel('Feature 01')
plt.ylabel('Feature 02')

# Data after PCA (indexes the first two components, so at least two must
# have been retained)
plt.subplot(1, 2, 2)
plt.scatter(
    transformed_data[:, 0],
    transformed_data[:, 1],
    c=iris.target,
    cmap='Set1',
)
plt.title('Data after PCA')
plt.xlabel('Principal Component 01')
plt.ylabel('Principal Component 02')
plt.savefig('output_plot.png')
plt.show()

# Display the number of components and explained variance ratio
print(f"\nNumber of components to retain 95% variance: {pca.n_components_}")
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Interpreting which features influenced the principal components the most:
# absolute loadings, scaled so each component's row sums to 1.
feature_contributions = (
    np.abs(pca.components_)
    / np.sum(np.abs(pca.components_), axis=1)[:, np.newaxis]
)
# Transpose so rows are features and columns are components.
feature_contributions_df = pd.DataFrame(
    feature_contributions.T,
    index=columns,
    columns=[f'PC{i + 1}' for i in range(pca.n_components_)],
)
print("\nFeature contributions to Principal Components:")
print(feature_contributions_df)

# Plotting the feature contributions
plt.figure(figsize=(12, 6))
for i in range(pca.n_components_):
    plt.subplot(1, pca.n_components_, i + 1)
    plt.bar(columns, feature_contributions_df.iloc[:, i])
    plt.title(f'PC{i + 1} Feature Contributions')
    plt.xlabel('Original Features')
    plt.ylabel('Contribution')
plt.savefig('output_plot_feature_contributions.png')
plt.show()