Code:
#!/usr/bin/env python
import csv
import logging
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    classification_report,
)
from sklearn.svm import SVC
from tqdm import tqdm
class Data(object):
    """Load one split of the dataset and expose it as NumPy arrays.

    Construction reads a class-label file and a tab-separated sample file,
    then drops every sample whose label is the string ``'14'``.

    Attributes:
        dataType: Caller-supplied tag; only used in printed messages.
        number_of_samples: First integer on line 1 of the class file.
        number_of_classes: Second integer on line 1 of the class file.
            Neither count is updated when `_clean` removes samples.
        classes: List of tokens from line 2 of the class file.
        Y: 1-D array of label strings from line 3 of the class file.
        feature_names: Column 1 of the sample table (one entry per row).
        X: Float matrix, transposed from the file so that each retained
            value column becomes one row.
    """

    def __init__(self, res_path, cls_path, dataType):
        """Read both files and remove invalid samples.

        Args:
            res_path: Path to the tab-separated sample file.
            cls_path: Path to the class-label file.
            dataType: Tag shown in progress and description output.
        """
        self.dataType = dataType
        self._get_classes(cls_path)
        self._get_tumor_samples(res_path)
        self._clean()

    def _get_classes(self, path):
        """Parse the class file: counts, class names, per-sample labels."""
        print(f"Getting {self.dataType} classes")
        with open(path, 'r') as f:
            reader = [line.strip() for line in tqdm(f.readlines())]
        # Split on any whitespace run so tabs or repeated spaces do not
        # produce empty tokens or unparsable integers.
        header = reader[0].split()
        self.number_of_samples = int(header[0])
        self.number_of_classes = int(header[1])
        # NOTE(review): every token on this line is kept. If the file starts
        # the names line with a marker token (e.g. '#'), it becomes
        # classes[0] and shifts the indices used by _get_binary -- confirm
        # against the data files.
        self.classes = reader[1].split()
        self.Y = np.array(reader[2].split())

    def _get_tumor_samples(self, path):
        """Parse the sample file into `feature_names` and `X`."""
        print(f"Getting {self.dataType} samples")
        with open(path, 'r') as inputFile:
            lines = [line.strip().split('\t')
                     for line in tqdm(inputFile.readlines())]
        # The first three lines are skipped; presumably header rows.
        data = np.array(lines[3:], dtype=object)
        self.feature_names = data[:, 1]
        data = data[:, 2:]
        # Keep columns 0, 2, 4, ... of the remainder and drop the odd ones
        # (presumably a non-numeric companion column per sample -- confirm).
        data = np.delete(data, list(range(1, data.shape[1], 2)), axis=1)
        # Transpose so that samples are rows and features are columns.
        self.X = data.astype(float).T

    def _get_binary(self, name):
        """Return one-vs-rest labels for the class called ``name``.

        A sample gets 1 when its label equals the position of ``name`` in
        `classes` (compared as a string), otherwise 0. An unknown ``name``
        yields an all-zero integer array shaped like `Y`.
        """
        try:
            index = self.classes.index(name)
        except ValueError:
            return np.zeros_like(self.Y, dtype=int)
        target = str(index)
        return np.array([1 if c == target else 0 for c in self.Y])

    def _describe(self):
        """Print the sizes of `X` and `Y` and the counts read from the file."""
        print(f"\n------ Data {self.dataType} Description -----")
        print(f"X len = {len(self.X)}")
        print(f"Y len = {len(self.Y)}")
        print(f"# Samples = {self.number_of_samples}")
        print(f"# Classes = {self.number_of_classes}")
        print("---------------------------------\n")

    def _clean(self):
        """Remove invalid labels (e.g., class 14 if it is not needed)."""
        invalid_indices = np.where(self.Y == '14')[0]
        if len(invalid_indices) > 0:
            print("Removing invalid entries...")
            # Delete the same rows from both arrays so they stay aligned.
            self.Y = np.delete(self.Y, invalid_indices, 0)
            self.X = np.delete(self.X, invalid_indices, 0)
def feature_selection(X, y, k_val):
    """Return the column indices of the ``k_val`` highest-scoring features.

    Columns of ``X`` are scored against ``y`` with the ANOVA F-statistic
    (``f_classif``) and the indices of the selected columns are returned.
    """
    selector = SelectKBest(score_func=f_classif, k=k_val)
    selector.fit(X, y)
    return selector.get_support(indices=True)
def plot_coefficients(classifier, feature_names, class_name,
                      top_features=20):
    """Save a bar chart of the most negative and most positive coefficients.

    Args:
        classifier: Fitted model exposing ``coef_``; only row 0 is plotted.
        feature_names: Sequence of names indexable by coefficient position.
        class_name: Used in the output file name.
        top_features: How many coefficients to take from each end of the
            sorted order. If fewer than ``2 * top_features`` coefficients
            exist, the two ends overlap and some bars appear twice.

    The chart is written to ``graphs/plot_<class_name>.png``; the directory
    is created when missing.
    """
    coef = classifier.coef_[0]
    ranked = np.argsort(coef)
    top_negative_coefficients = ranked[:top_features]
    top_positive_coefficients = ranked[-top_features:]
    top_coefficients = np.hstack([top_negative_coefficients,
                                  top_positive_coefficients])
    # Derive positions from what was actually selected so bars, colors and
    # labels always have matching lengths.
    positions = np.arange(len(top_coefficients))

    fig = plt.figure(figsize=(30, 15))
    try:
        colors = ['#cccccc' if c < 0 else 'teal'
                  for c in coef[top_coefficients]]
        plt.bar(positions, coef[top_coefficients], color=colors)
        labels = np.array(feature_names)[top_coefficients]
        # Ticks share the bar positions so each label sits under its own bar.
        plt.xticks(positions, labels, rotation='vertical', ha='center')
        output_path = f"graphs/plot_{class_name}.png"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        plt.savefig(output_path)
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
def run_test(train, test):
    """Run the classification pipeline and print accuracy per setting.

    For k = 1..50: select the k best features for each class (one-vs-rest,
    via ``feature_selection``), take the union over all classes, fit a
    linear SVC on those columns and score it on the test split.

    Args:
        train: ``Data`` instance used for scaling, selection and fitting.
        test: ``Data`` instance used for evaluation.

    Side effects:
        ``train.X`` and ``test.X`` are replaced by their standardized
        versions (scaler fitted on the training split only). Results are
        printed; nothing is returned.
    """
    train._describe()
    test._describe()

    normalizer = preprocessing.StandardScaler().fit(train.X)
    train.X = normalizer.transform(train.X)
    test.X = normalizer.transform(test.X)
    y_train = train.Y.astype(int)
    y_test = test.Y.astype(int)

    # The one-vs-rest targets do not depend on k, so build them once.
    binary_targets = [train._get_binary(cls) for cls in train.classes]

    accuracy = []
    for x in range(1, 51):
        best_features = set()
        for binary_labels in binary_targets:
            best_features.update(
                feature_selection(train.X, binary_labels, x))
        # Sort so the column order does not depend on set iteration order.
        best_features = sorted(best_features)

        X_train_selected = train.X[:, best_features]
        X_test_selected = test.X[:, best_features]

        # Only predict() is used, so probability estimates (and the extra
        # internal cross-validation they require) are not requested.
        model = SVC(kernel="linear")
        model.fit(X_train_selected, y_train)
        results = model.predict(X_test_selected)

        acc = accuracy_score(y_test, results)
        accuracy.append(acc)
        print(f"Iteration {x}: Accuracy = {acc:.4f}")
        # NOTE(review): the source's indentation was lost; the report is
        # emitted per iteration here -- confirm it was not meant to run
        # once after the loop.
        print(classification_report(y_test, results))

    print("Max Accuracy:", np.max(accuracy))
    # Index 0 corresponds to k = 1, hence the +1.
    print("Best feature count:", np.argmax(accuracy) + 1)
# Script entry point: load both splits and run the evaluation.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    train = Data('data/Training_res.txt', 'data/Training_cls.txt', 'train')
    test = Data('data/Test_res.txt', 'data/Test_cls.txt', 'test')
    run_test(train, test)
Output:
Datasets:
Bladder
Breast
Cns
Colorectal
Leukemia
Lung
Lymphoma
Melanoma
Mesothelioma
Ovary
Pancreas
Prostate
Renal
Uterus