NF Assignment 4


import pandas as pd

# Load the dataset
data = pd.read_csv('annthyroid_21feat_normalised.csv')

# Preprocess the data if needed
# (e.g., handle missing values, scale the features)

# Split the data into features and labels
X = data.drop(columns=['class'])  # Features
y = data['class']  # Labels (assuming the 'class' column indicates anomaly)

# Instantiate the PCA model
# Note: PCAModel is the PCA-based anomaly detector assumed to be defined
# elsewhere in the assignment; a possible sketch is given after the output below.
pca_model = PCAModel()

# Train the model
pca_model.train(X, None, num_features=2)  # No validation data needed for PCA

# Compute anomaly scores for the entire dataset
anomaly_scores = pca_model.compute_anomaly_score(X)

# Print or further analyze the anomaly scores
print("Anomaly scores:")
print(anomaly_scores)

Explained variation per principal component: 0.46743936111388296


Anomaly scores:
0 0.048096
1 0.006211
2 0.002051
3 0.001602
4 0.005633
...
7195 0.045009
7196 0.005748
7197 0.045752
7198 0.002172
7199 0.047087
Length: 7200, dtype: float64
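
The PCAModel class is not defined in this notebook, so the cell above only runs if the assignment supplies it. Below is a minimal sketch of what such a class could look like, using reconstruction error as the anomaly score; the class name and the train/compute_anomaly_score signatures are taken from the calls above, but the internals are an assumption, not the assignment's actual implementation.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

class PCAModel:
    """Hypothetical PCA-based anomaly detector matching the calls above."""

    def train(self, X, val_data, num_features=2):
        # val_data is unused: plain PCA needs no validation set
        self.pca = PCA(n_components=num_features)
        self.pca.fit(X)
        print("Explained variation per principal component:",
              self.pca.explained_variance_ratio_.sum())

    def compute_anomaly_score(self, X):
        # Project each sample onto the principal subspace and back, then
        # score it by its mean squared reconstruction error: samples the
        # low-dimensional subspace represents poorly get high scores.
        X_hat = self.pca.inverse_transform(self.pca.transform(X))
        return pd.Series(((np.asarray(X) - X_hat) ** 2).mean(axis=1))

Reconstruction error is only one common choice of PCA anomaly score; the assignment's own PCAModel may use a different rule (e.g., a distance computed in the projected space).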

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
data = pd.read_csv('annthyroid_21feat_normalised.csv')


# Preprocess the data if needed
# (e.g., handle missing values, scale the features)

# Split the data into features and labels
X = data.drop(columns=['class'])  # Features
y = data['class']  # Labels (assuming the 'class' column indicates anomaly)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and train the model (example with Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict labels on the validation set
y_pred = model.predict(X_val)

# Calculate performance metrics
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')
report = classification_report(y_val, y_pred)

# Print or log the metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", report)

# Optionally, you can include other metrics such as precision, recall, confusion matrix, etc.

Accuracy: 0.9993055555555556
F1 Score: 0.9993036997916849
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1352
           1       1.00      0.99      0.99        88

    accuracy                           1.00      1440
   macro avg       1.00      0.99      1.00      1440
weighted avg       1.00      1.00      1.00      1440
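
Because labels are available, the unsupervised PCA anomaly scores from the first cell can also be evaluated directly against them. A minimal sketch, assuming anomaly_scores and y from the cells above are still in scope (roc_auc_score treats higher scores as more anomalous):

from sklearn.metrics import roc_auc_score

# Threshold-free check of how well the PCA scores separate the two classes:
# an AUC near 1.0 means anomalies tend to receive higher scores than normals.
print("PCA anomaly-score ROC AUC:", roc_auc_score(y, anomaly_scores))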

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Plot confusion matrix
cm = confusion_matrix(y_val, y_pred)


plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Plot feature importances (if using RandomForestClassifier)
if isinstance(model, RandomForestClassifier):
    feature_importances = model.feature_importances_
    feature_names = X.columns
    sorted_idx = feature_importances.argsort()

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(sorted_idx)), feature_importances[sorted_idx], align='center')
    plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importances')
    plt.show()

# Plot ROC curve (if applicable)

# Note: the ROC curve should be computed from predicted scores rather than hard
# labels, so use the predicted probability of the anomaly class here. For
# multi-class problems, a one-vs-rest strategy is needed, with a separate ROC
# curve and AUC per class. A runnable sketch for this binary task follows below.
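
A possible version of that example for this binary task, assuming model, X_val, and y_val from the earlier cells are still in scope:

from sklearn.metrics import roc_curve, auc

# Score the validation set with the predicted probability of class 1 (anomaly)
y_score = model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()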

# Additional plots or charts as needed

