Anemia Code
!jupyter --version
import sys
print("Python version:", sys.version)
# Classifier Libraries
from sklearn.linear_model import LogisticRegression # for implementing logistic regression algorithm
from sklearn.tree import DecisionTreeClassifier # for implementing decision tree algorithm
from sklearn.ensemble import RandomForestClassifier # for implementing random forest algorithm
from sklearn.svm import SVC # for implementing Support Vector Machine (SVM) algorithm
from sklearn.naive_bayes import GaussianNB # for implementing Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier # for implementing K-Nearest Neighbors (KNN) algorithm
# Other Libraries
from sklearn.model_selection import train_test_split # for splitting data into training and testing sets
from sklearn.pipeline import make_pipeline # for building a pipeline of transforms with a final estimator
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline # for building a pipeline with imbalanced datasets
from imblearn.over_sampling import SMOTE # for oversampling imbalanced datasets using Synthetic Minority Over-sampling Technique (SMOTE)
from imblearn.under_sampling import NearMiss # for undersampling imbalanced datasets using the NearMiss algorithm
from imblearn.metrics import classification_report_imbalanced # for generating a classification report for imbalanced datasets
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report # for computing various performance metrics for classification models
from collections import Counter # for counting the frequency of elements in a list
from sklearn.model_selection import KFold, StratifiedKFold # for k-fold cross-validation
from sklearn.model_selection import cross_val_score # for evaluating a model using cross-validation
from sklearn.metrics import cohen_kappa_score # for computing Cohen's kappa score for inter-rater agreement
import warnings
warnings.filterwarnings("ignore")
import pandas as pd # for dataframes (used throughout; missing from the original imports)
import matplotlib.pyplot as plt # for plotting (missing from the original imports)
import seaborn as sns # for statistical plots (missing from the original imports)
pd.set_option('display.max_columns', 5000) # set the maximum number of columns to display in pandas dataframes
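The data is inspected next, but the loading step is missing from the excerpt; a minimal sketch, assuming the dataset sits in a CSV file (the filename 'anemia.csv' is hypothetical):
df = pd.read_csv('anemia.csv') # hypothetical filename; the actual data source is not shown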
df.shape
df.head()
df.info()
# column names
df.columns
# Checking for null values
df.isnull().sum()
# Import numpy
import numpy as np
df_copy = df.copy() # plotting copy; its construction is not shown in the source
custom_colors = ['#66b3ff', '#ff9999'] # assumed palette; not defined in the excerpt
result_counts = df_copy['Result'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors, shadow=True)
plt.title('Distribution of Anemia Result')
plt.show()
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Result'].value_counts()
print("-----")
# The classes are heavily skewed we need to solve this issue later.
print('Non Anemic', round(df['Result'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Anemic', round(df['Result'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Gender'].value_counts()
custom_colors_gender = ['#99ccff', '#ffcc99'] # assumed palette; not defined in the excerpt
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors_gender, shadow=True)
plt.title('Gender distribution')
plt.show()
df.head()
df_copy.head()
# anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()
print(sns.barplot.__doc__)
# Remove spines
sns.despine(left=True, bottom=True)
plt.figure(figsize=(8, 6))
# Show the plot
plt.show()
anemia_rates = df.groupby('Gender')['Result'].mean().round(2)
anemia_rates
# # Add legend
# plt.legend(labels=['Hemoglobin'], loc='upper right')
# Remove spines
sns.despine(left=True, bottom=True)
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Remove spines
sns.despine(left=True, bottom=True)
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Remove spines
sns.despine(left=True, bottom=True)
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# dictionary with the summary values for MCHC, MCV, MCH, and hemoglobin
blood_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level',
                         'Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level',
                         'Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level',
                         'Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
              'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min(),
                        df['MCV'].max(), df['MCV'].mean(), df['MCV'].min(),
                        df['MCH'].max(), df['MCH'].mean(), df['MCH'].min(),
                        df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}
# Create a pandas df
blood_table = pd.DataFrame(blood_data)
styled_table = (blood_table.style
 .format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
sns.set_style("whitegrid")
sns.boxplot(x='Result', y='Hemoglobin', data=df_copy, palette=custom_colors)
plt.title('Distribution of Hemoglobin Levels by Anemia Result')
plt.xlabel('Anemia Result')
plt.ylabel('Hemoglobin Level')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
# Remove spines
sns.despine(left=True, bottom=True)
sns.set(style="ticks") # set the style before drawing the grid
g = sns.pairplot(df, hue='Result')
# legend = g._legend
# legend.texts[0].set_text('Non Anemic')
# legend.texts[1].set_text('Anemic')
# g.fig.suptitle("Relations in the Dataset", y=1.03)
**STATISTICAL TEST (T-TEST)**
A t-test is a statistical test used to determine whether there is a significant difference between the means of two groups. In our case, we use a t-test to determine whether there is a significant difference in mean hemoglobin levels between males and females.
As we saw, Hemoglobin has negative skewness, but the t-test assumes the data are normally distributed. So before performing the t-test, we take the logarithm of the data, which can help to reduce the skewness.
df_stat = df.copy()
df_stat.head()
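The test itself does not appear in the excerpt; a minimal sketch with scipy.stats.ttest_ind, taking logs as described above. The 0/1 coding of Gender is an assumption, not shown in the source:
from scipy import stats
# log-transform hemoglobin to reduce skewness, as described above
df_stat['log_hb'] = np.log(df_stat['Hemoglobin'])
# split by gender; the 0 = male / 1 = female coding is an assumption
male_hb = df_stat.loc[df_stat['Gender'] == 0, 'log_hb']
female_hb = df_stat.loc[df_stat['Gender'] == 1, 'log_hb']
# Welch's t-test (does not assume equal variances)
t_stat, p_value = stats.ttest_ind(male_hb, female_hb, equal_var=False)
print('t = {:.3f}, p = {:.4f}'.format(t_stat, p_value))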
**ODDS RATIO**
# Fit a logistic regression model with gender as the predictor of anemia status
import statsmodels.api as sm # import missing in the source
# 'is_anemic' and 'is_female' are assumed to be 0/1 indicator columns; their construction is not shown
logit_model = sm.Logit(df_stat['is_anemic'], sm.add_constant(df_stat['is_female']))
result = logit_model.fit()
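To read the fitted coefficient as an odds ratio, exponentiate it, which is standard for logistic regression:
# exponentiating logit coefficients yields odds ratios
odds_ratios = np.exp(result.params)
print(odds_ratios)
print(result.summary())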
**CHI-SQUARE TEST**
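No code follows this heading in the excerpt; a minimal sketch of a chi-square test of independence between Gender and Result, using scipy.stats.chi2_contingency:
from scipy.stats import chi2_contingency
# contingency table of gender vs anemia result
contingency = pd.crosstab(df_stat['Gender'], df_stat['Result'])
chi2, p, dof, expected = chi2_contingency(contingency)
print('chi2 = {:.3f}, p = {:.4f}, dof = {}'.format(chi2, p, dof))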
**FEATURE SELECTION**
CORRELATION (PEARSON CORRELATION)
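The correlation computation itself is not shown; a minimal sketch using pandas' Pearson correlation with a seaborn heatmap:
# Pearson correlation matrix of the numeric columns
corr = df.corr(method='pearson')
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Pearson Correlation Matrix')
plt.show()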
**SELECTKBEST**
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# X/y are assumed here; chi2 requires non-negative features, which holds for these columns
X = df.drop('Result', axis=1)
y = df['Result']
k_values = range(1, X.shape[1] + 1) # assumed range; not shown in the source
for k in k_values:
    # apply SelectKBest class to extract top k best features
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)
# pair each feature with its chi-square score
featureScores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
print("---")
print(featureScores)
print("---")
print(featureScores.nlargest(3, 'Score'))
feat_importances = pd.Series(fit.scores_, index=X.columns) # assumed construction; not shown in the source
feat_importances.nlargest(3).plot(kind='barh', color='#808080')
plt.xlabel('Importance')
# plt.title('Top 3 Features Importance', fontweight='bold', fontsize=12)
# Remove spines
sns.despine(left=False, bottom=True)
plt.savefig('Top2Feature.jpg') # save before plt.show(), which clears the figure
plt.show()
**SCALING FEATURES**
#SCALE BY NORMALIZATION
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler() # object of the class MinMaxScaler()
df['Hemoglobin_minmax'] = mm.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
# Let's explore the data by Result and see the distribution of Hemoglobin
fig , axs = plt.subplots(nrows = 1 , ncols = 4 , figsize = (16,4))
df.columns
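The split itself does not appear in the excerpt; a minimal sketch using the X and y built during feature selection, with a stratified 80/20 split (the ratio and random_state are assumptions):
# stratified split keeps the anemic / non-anemic ratio the same in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)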
# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)
**CLASSIFICATION MODELS**
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression # Importing Classifier Step
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Model Evaluation
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
# Predicted values counts for Anemic and Non Anemic of test dataset
pd.Series(y_pred).value_counts()
# Actual values counts for Anemic and Non Anemic of test dataset
pd.Series(y_test).value_counts()
183/181 # quick arithmetic check on the predicted vs actual counts above
# confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix
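The heatmap code for the matrix is missing from the excerpt; a minimal sketch (the commented tick labels below can then be applied to this ax):
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cnf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')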
# ax.xaxis.set_ticklabels(['Non Anemic', 'Anemic']);
# ax.yaxis.set_ticklabels(['Non Anemic', 'Anemic']);
plt.show()
181/181 # quick arithmetic check
**ROC**
metrics.roc_auc_score(y_test , y_pred) # AUC computed from hard labels; the probabilities below give the usual ranking-based AUC
y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba
plt.figure(figsize=(8,6))
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred)
print('f1=%.3f' % (f1))
# create figure and axis objects with custom size and padding
fig, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)
# plot no skill
ax.plot([0, 1], [0.5, 0.5], linestyle='--', color='gray', lw=1)
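The curve itself is not drawn in the excerpt; a minimal sketch that adds the ROC curve to the axes above, using the predicted probabilities computed earlier:
# use the probability of the positive class for the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba[:, 1])
ax.plot(fpr, tpr, label='Logistic Regression (AUC = {:.3f})'.format(metrics.roc_auc_score(y_test, y_pred_proba[:, 1])))
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()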
# Imbalanced data typically refers to classification problems where the classes are not represented equally. A classifier trained on such a dataset is likely to predict everything as the majority class, which is why learning from highly imbalanced datasets is regarded as a problem.
# Random oversampling
# Random oversampling duplicates examples from the minority class in the training dataset and can result in overfitting for some models.
# Random undersampling
# Random undersampling deletes examples from the majority class and can discard information valuable to the model.
from collections import Counter # Counter returns a dict-like mapping of values to their counts
from sklearn.datasets import make_classification
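Between these imports and the prediction below, the undersampling and refit step appears to be missing; a minimal sketch, assuming imblearn's RandomUnderSampler:
from imblearn.under_sampling import RandomUnderSampler
# undersample the majority class in the training data, then refit the model
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
logreg.fit(X_train_rus, y_train_rus)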
y_pred_rus = logreg.predict(X_test)
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_rus)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
plt.show()
from imblearn.over_sampling import RandomOverSampler # import missing in the source
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
logreg.fit(X_train_ros, y_train_ros) # refit on the oversampled training data before predicting
y_pred_ros = logreg.predict(X_test)
The model reaches an F1 score of 0.94 on the test set both with and without data leakage; here, data leakage did not have a significant impact on the model's performance.
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_ros)
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_ros)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
plt.show()
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)
y_pred_smote = logreg.predict(X_test)
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_smote)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
plt.show()
from imblearn.over_sampling import ADASYN # import missing in the source
adasyn = ADASYN(random_state=42)
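The resample-and-refit step is not shown before the prediction below; a minimal sketch:
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
logreg.fit(X_train_adasyn, y_train_adasyn)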
y_pred_adasyn = logreg.predict(X_test)
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_adasyn)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
plt.show()
# lists that accumulate per-model results for the final summary table
names_lst = []
accuracy_train_lst, accuracy_test_lst = [], []
aucs_train_lst, aucs_test_lst = [], []
precision_train_lst, precision_test_lst = [], []
recall_train_lst, recall_test_lst = [], []
f1_train_lst, f1_test_lst = [], []
kappa_lst = []
# 'models' is assumed to be a list of (name, estimator) pairs; the original
# indentation and the test-side metric lines were lost and are reconstructed here
def build_measure_model(models):
    plt.figure(figsize=(12,6))
    for name, model in models:
        names_lst.append(name)
        # Build model
        model.fit(X_train, y_train)
        # Predict
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        # calculate accuracy
        Accuracy_train = metrics.accuracy_score(y_train, y_train_pred)
        accuracy_train_lst.append(Accuracy_train)
        accuracy_test_lst.append(metrics.accuracy_score(y_test, y_test_pred))
        # calculate auc
        Aucs_train = metrics.roc_auc_score(y_train, y_train_pred)
        aucs_train_lst.append(Aucs_train)
        aucs_test_lst.append(metrics.roc_auc_score(y_test, y_test_pred))
        # calculate precision
        PrecisionScore_train = metrics.precision_score(y_train, y_train_pred)
        precision_train_lst.append(PrecisionScore_train)
        precision_test_lst.append(metrics.precision_score(y_test, y_test_pred))
        # calculate recall
        RecallScore_train = metrics.recall_score(y_train, y_train_pred)
        recall_train_lst.append(RecallScore_train)
        recall_test_lst.append(metrics.recall_score(y_test, y_test_pred))
        # calculate f1 score
        F1Score_train = metrics.f1_score(y_train, y_train_pred)
        f1_train_lst.append(F1Score_train)
        F1Score_test = metrics.f1_score(y_test, y_test_pred)
        f1_test_lst.append(F1Score_test)
        # Cohen's kappa on the test predictions (feeds the 'Kappa Stat' column later)
        kappa_lst.append(cohen_kappa_score(y_test, y_test_pred))
        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))
        print("\n")
        # plot this model's ROC curve (reconstructed step; uses probabilities when available)
        if hasattr(model, 'predict_proba'):
            fpr, tpr, _ = metrics.roc_curve(y_test, model.predict_proba(X_test)[:, 1])
            plt.plot(fpr, tpr, label=name)
    plt.legend(loc=4)
    plt.plot([0,1], [0,1], 'k--' )
    plt.rcParams['font.size'] = 12
    #plt.title('ROC curve for Predicting a anemia cases')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()
**DECISION TREE**
DTmodels = []
dt = DecisionTreeClassifier()
DTmodels.append(('Decision Tree', dt)) # (name, model) registration and the call below are implied but missing in the source
build_measure_model(DTmodels)
**RANDOM FOREST**
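No code follows this heading in the excerpt; a minimal sketch by analogy with the decision tree cell above:
RFmodels = []
rf = RandomForestClassifier(random_state=42)
RFmodels.append(('Random Forest', rf))
build_measure_model(RFmodels)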
**KNN**
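Likewise reconstructed by analogy:
KNNmodels = []
knn = KNeighborsClassifier()
KNNmodels.append(('KNN', knn))
build_measure_model(KNNmodels)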
**SVM**
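Likewise reconstructed; probability=True lets the ROC step in build_measure_model use predict_proba:
SVMmodels = []
svm = SVC(probability=True, random_state=42)
SVMmodels.append(('SVM', svm))
build_measure_model(SVMmodels)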
# Model evaluation
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: {0:0.5f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUC: {0:0.5f}'.format(metrics.roc_auc_score(y_test, y_pred)))
print('Precision: {0:0.5f}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall: {0:0.5f}'.format(metrics.recall_score(y_test, y_pred)))
print('F1: {0:0.5f}'.format(metrics.f1_score(y_test, y_pred)))
data = {'Model':names_lst,
#'Accuracy_Train':accuracy_train_lst,
'Accuracy_Test':accuracy_test_lst,
#'AUC_Train':aucs_train_lst,
'AUC_Test':aucs_test_lst,
#'PrecisionScore_Train':precision_train_lst,
'PrecisionScore_Test':precision_test_lst,
#'RecallScore_Train':recall_train_lst,
'RecallScore_Test':recall_test_lst,
#'F1Score_Train':f1_train_lst,
'F1Score_Test':f1_test_lst,
'Kappa Stat' : kappa_lst
}
finaltable = pd.DataFrame(data) # assemble the summary table; this step is implied but missing in the source
finaltable.to_excel('my_table.xlsx', index=False)
**HYPERPARAMETER TUNING**
from sklearn.model_selection import GridSearchCV # import missing in the source
# classifiers to tune; the original list is missing and is reconstructed to match the two grids below
classifiers_ = [LogisticRegression(max_iter=1000), KNeighborsClassifier()]
param_grids = [
    {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, # logistic regression
    {"n_neighbors": list(range(2,60,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}] # KNN
# perform GridSearchCV for each classifier
for clf, param_grid in zip(classifiers_, param_grids):
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print(clf.__class__.__name__, "best params:", grid_search.best_params_, "best score:", grid_search.best_score_)
    print("F1 score:", f1_score(y_test, y_pred))
**CROSS VALIDATION**
# decision tree with tuned hyperparameters
dt_tuning = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
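The cross-validation call itself does not appear in the excerpt; a minimal sketch with the cross_val_score and StratifiedKFold imported earlier (the 10-fold setting is an assumption):
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(dt_tuning, X, y, cv=skf, scoring='accuracy')
print('CV accuracy: {:.4f} +/- {:.4f}'.format(cv_scores.mean(), cv_scores.std()))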
# Testing data
# Remove spines
sns.despine(left=True, bottom=True)
**PLOTTING ACCURACY**
import pickle
from sklearn.ensemble import RandomForestClassifier
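No persistence code follows these imports in the excerpt; a minimal sketch that fits, saves, and reloads a model with pickle (the variable and file names are hypothetical):
rf_final = RandomForestClassifier(random_state=42)
rf_final.fit(X_train, y_train)
# serialize the fitted model to disk and load it back
with open('anemia_model.pkl', 'wb') as f:
    pickle.dump(rf_final, f)
with open('anemia_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print('Reloaded model test accuracy:', loaded_model.score(X_test, y_test))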