Anemia Code

#Print system version

!jupyter --version
import sys
print("Python version:", sys.version)

# importing necessary libraries

import pandas as pd # for data manipulation and analysis


import collections # for creating and manipulating Python's collections like OrderedDict, defaultdict, Counter, etc.
import numpy as np # for scientific computing with Python
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import seaborn as sns # for advanced visualization

# Classifier Libraries
from sklearn.linear_model import LogisticRegression # for implementing logistic regression algorithm
from sklearn.tree import DecisionTreeClassifier # for implementing decision tree algorithm
from sklearn.ensemble import RandomForestClassifier # for implementing random forest algorithm
from sklearn.svm import SVC # for implementing Support Vector Machine (SVM) algorithm
from sklearn.naive_bayes import GaussianNB # for implementing Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier # for implementing K-Nearest Neighbors (KNN) algorithm

# For Statistical testing


from scipy.stats import ttest_ind # for computing t-test for two independent samples
import statsmodels.api as sm # for statistical models and tests
from scipy.stats import chi2_contingency # for computing chi-square statistic and p-value for a contingency table
import scipy.stats as stats # for implementing skewness and other stats

# Other Libraries
from sklearn.model_selection import train_test_split # for splitting data into training and testing sets
from sklearn.pipeline import make_pipeline # for building a pipeline of transforms with a final estimator
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline # for building a pipeline with imbalanced datasets
from imblearn.over_sampling import SMOTE # for oversampling imbalanced datasets using Synthetic Minority Over-sampling Technique (SMOTE)
from imblearn.under_sampling import NearMiss # for undersampling imbalanced datasets using the NearMiss algorithm
from imblearn.metrics import classification_report_imbalanced # for generating a classification report for imbalanced datasets
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report # for computing various performance metrics for classification models
from collections import Counter # for counting the frequency of elements in a list
from sklearn.model_selection import KFold, StratifiedKFold # for k-fold cross-validation
from sklearn.model_selection import cross_val_score # for evaluating a model using cross-validation
from sklearn.metrics import cohen_kappa_score # for computing Cohen's kappa score for inter-rater agreement

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 5000) # for setting the maximum number of columns to display in pandas dataframes

# first read the data file


df= pd.read_csv('/content/drive/MyDrive/anemia.csv')

df.shape

df.head()

# Print summary statistics


df.describe()

df.shape

df.info()

#columns name
df.columns

#Checking Null
# Import numpy
import numpy as np

# Inspect missing values in the dataset


print(df.isnull().values.sum())

# Replace the ' 's with NaN


df = df.replace(" ",np.NaN)

# Count the number of NaNs in the dataset to verify


print(df.isnull().values.sum())

# Create a copy of the DataFrame to avoid modifying the original data


df_copy = df.copy()

# Rename values in the 'Result' column just for the plot


df_copy['Result'] = df_copy['Result'].replace({0: 'Non Anemic', 1: 'Anemic'})
# Rename values in the 'Gender' column
df_copy['Gender'] = df_copy['Gender'].replace({0: 'Male', 1: 'Female'})

# Define custom hex colors


custom_colors = ['#B43757', '#a37b85']
custom_colors_gender = ['#90ADC6', '#C6A990']
print(df_copy)

result_counts = df_copy['Result'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors,
shadow=True)
plt.title('Distribution of Anemia Result')
plt.show()

# Create a count plot of the anemia result


ax= sns.countplot(x='Result', data=df_copy, palette=custom_colors)
plt.title('Count of Anemia Result')

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

plt.show()

result_counts = df_copy['Result'].value_counts()

# Print the counts of the two categories


print(result_counts)

# Check if the two categories are balanced or not


if result_counts[0] == result_counts[1]:
    print('The two categories are balanced.')
else:
    print('The two categories are not balanced.')

print("-----")
# The classes are heavily skewed we need to solve this issue later.
print('Non Anemic', round(df['Result'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Anemic', round(df['Result'].value_counts()[1]/len(df) * 100,2), '% of the dataset')

ax= sns.countplot(x='Gender', hue ='Result', data=df_copy, palette=custom_colors)


plt.title('Number of Individuals with and without Anemia by Gender')

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

plt.show()

result_counts = df_copy['Gender'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors_gender,
shadow=True)
plt.title('Gender distribution ')
plt.show()

df.head()

df_copy.head()

# anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()

# ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=custom_colors_gender)


# ax.set_xticklabels(['Male', 'Female'])
# plt.title('Mean Anemia Rate by Gender')
# plt.xlabel('Gender')
# plt.ylabel('Mean Anemia Rate')
# plt.show()

print(sns.barplot.__doc__)

color_gen = {0: '#90ADC6', 1: '#C6A990'} # keys match the numeric Gender codes (0 = Male, 1 = Female)


anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()

# Create the bar plot


ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=color_gen)

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.2f}'.format(p.get_height()), ha='center')
ax.set_xticklabels(['Male', 'Female'])

# Add plot titles and labels


plt.title('Mean Anemia Rate by Gender | Which gender is more often anemic?', fontsize=16, fontweight='bold')
plt.xlabel('Gender' , fontsize=12)
plt.ylabel('Mean Anemia Rate' , fontsize=12)

# Remove spines
sns.despine(left=True, bottom=True)

# Remove vertical lines from the grid


plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)

plt.figure(figsize=(8, 6))
# Show the plot
plt.show()

anemia_rates = df.groupby('Gender')['Result'].mean().round(2)
anemia_rates

# Create separate subsets for males and females


male_data = df_copy[df_copy['Gender'] == 'Male']
female_data = df_copy[df_copy['Gender'] == 'Female']

# Plot horizontal violinplot using Seaborn


sns.violinplot(x='Hemoglobin', y='Gender', hue='Result', data=df_copy, palette=custom_colors,
inner='quartile', scale='width', cut=0)

# Add mean and median lines


for i, group in enumerate([male_data, female_data]):
    median = group['Hemoglobin'].median()
    mean = group['Hemoglobin'].mean()
    plt.axhline(y=i, xmin=0.05, xmax=0.48, color='black', linewidth=2)
    plt.text(0.51, i+0.1, f'Median: {median:.2f}', ha='left', va='center')
    plt.text(0.51, i-0.1, f'Mean: {mean:.2f}', ha='left', va='center')

# Add IQR whiskers


q1_male, q3_male = male_data['Hemoglobin'].quantile([0.25, 0.75])
q1_female, q3_female = female_data['Hemoglobin'].quantile([0.25, 0.75])
plt.axhline(y=0, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.axhline(y=1, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.plot([q1_male, q1_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q3_male, q3_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q1_female, q1_female], [0.8, 1.2], color='black', linewidth=2)
plt.plot([q3_female, q3_female], [0.8, 1.2], color='black', linewidth=2)
plt.text((q1_male+q3_male)/2, -0.3, f'IQR: {q3_male-q1_male:.2f}', ha='center', va='center')
plt.text((q1_female+q3_female)/2, 1.3, f'IQR: {q3_female-q1_female:.2f}', ha='center', va='center')

# Add title and labels


plt.title('Distribution of Hemoglobin Levels by Gender')
plt.xlabel('Hemoglobin Level')
plt.ylabel('Gender')

# Show the plot


plt.show()

iqr = np.percentile(df['Hemoglobin'], 75) - np.percentile(df['Hemoglobin'], 25)

# Bin width using the Freedman-Diaconis rule


bin_width = 2 * iqr / (len(df)**(1/3))
sns.distplot(df['Hemoglobin'], hist=True, kde=True,
bins=int(round((df['Hemoglobin'].max() - df['Hemoglobin'].min()) / bin_width)),
color='#d60266',
hist_kws={'edgecolor':'black', 'alpha': 0.8},
kde_kws={'linewidth': 2})

# Add labels and adjust font sizes


#plt.title('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
plt.xlabel('Hemoglobin', fontsize=12)
plt.ylabel('Count', fontsize=12)

# # Add legend
# plt.legend(labels=['Hemoglobin'], loc='upper right')

# Remove spines
sns.despine(left=True, bottom=True)

# Remove vertical lines from the grid


plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)

# Adjust plot size


plt.figure(figsize=(8, 6))
# Show plot
plt.show()
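As a quick cross-check on the Freedman-Diaconis bin width computed above, numpy can derive the same binning directly. A minimal sketch, assuming df and the Hemoglobin column from the earlier cells:

fd_edges = np.histogram_bin_edges(df['Hemoglobin'], bins='fd') # 'fd' applies the Freedman-Diaconis rule
print("Number of Freedman-Diaconis bins:", len(fd_edges) - 1)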

# Calculate skewness using the skew() function


skewness = stats.skew(df['Hemoglobin'])

# Calculate kurtosis using the kurtosis() function (fisher=False gives Pearson's definition)


kurtosis = stats.kurtosis(df['Hemoglobin'], fisher=False)

# Print the result


print("Skewness:", skewness)
# Print the result
print("Kurtosis:", kurtosis)

# Create a dictionary with the values


hemoglobin_data = {'Metric': ['Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
                   'Value': [df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}

# Create a pandas DataFrame from the dictionary


hemoglobin_table = pd.DataFrame(hemoglobin_data)

# Create the table using Seaborn styling


styled_table = (hemoglobin_table.style
.set_caption('Hemoglobin Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter the data by anemia status


anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='Hemoglobin', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='Hemoglobin', ax=ax2, color='green', binwidth=0.5)

# Set titles and axis labels


ax1.set_title('Hemoglobin Levels in Patients with Anemia', fontsize=14, fontweight='bold')
ax2.set_title('Hemoglobin Levels in Patients without Anemia', fontsize=14, fontweight='bold')
fig.suptitle('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Remove spines
sns.despine(left=True, bottom=True)

# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)

# Adjust plot size


plt.tight_layout()

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular hemoglobin (MCH)


MCH_data = {'Metric': ['Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level'],
'Value': [df['MCH'].max(), df['MCH'].mean(), df['MCH'].min()]}

# Create a pandas DataFrame from the dictionary


MCH_table = pd.DataFrame(MCH_data)

# Create the table using Seaborn styling


styled_table = (MCH_table.style
.set_caption('MCH Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter the data by anemia status


anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCH', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='MCH', ax=ax2, color='green', binwidth=0.5)

# Set titles and axis labels


ax1.set_title('Mean Corpuscular Hemoglobin Levels in Patients with Anemia', fontsize=14,
fontweight='bold')
ax2.set_title('Mean Corpuscular Hemoglobin Levels in Patients without Anemia', fontsize=14,
fontweight='bold')
fig.suptitle('Distribution of Mean Corpuscular Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Remove spines
sns.despine(left=True, bottom=True)

# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)

# Adjust plot size


plt.tight_layout()

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular hemoglobin concentration (MCHC)


MCHC_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min()]}

# Create a pandas DataFrame from the dictionary


MCHC_table = pd.DataFrame(MCHC_data)

# Create the table using Seaborn styling


styled_table = (MCHC_table.style
.set_caption('MCHC Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter data for anemia and non-anemia cases


anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCHC', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCHC', ax=ax2, color='green', bins=20)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Set titles and axis labels


ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia', fontweight='bold')
fig.suptitle('Mean Corpuscular Hemoglobin Concentration Levels', fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

# Filter data for anemia and non-anemia cases


anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCV', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCV', ax=ax2, color='green', bins=20)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Set titles and axis labels


ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia',fontweight='bold' )
fig.suptitle('Mean Corpuscular Volume Levels',fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular volume (MCV)


MCV_data = {'Metric': ['Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level'],
'Value': [df['MCV'].max(), df['MCV'].mean(), df['MCV'].min()]}
# Create a pandas DataFrame from the dictionary
MCV_table = pd.DataFrame(MCV_data)

# Create the table using Seaborn styling


styled_table = (MCV_table.style
.set_caption('MCV Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# dictionary with the values for MCHC, MCV, MCH, and hemoglobin
blood_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level',
'Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level',
'Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level',
'Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min(),
df['MCV'].max(), df['MCV'].mean(), df['MCV'].min(),
df['MCH'].max(), df['MCH'].mean(), df['MCH'].min(),
df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}

# Create a pandas df
blood_table = pd.DataFrame(blood_data)

# Create the table


styled_table = (blood_table.style
.set_caption('Blood Test Results')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption', 'props': [('font-size', '18px'),
('font-weight', 'bold'), ('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

sns.set_style("whitegrid")
sns.boxplot(x='Result', y='Hemoglobin', data=df_copy, palette=custom_colors)
plt.title('Distribution of Hemoglobin Levels by Anemia Result')
plt.xlabel('Anemia Result')
plt.ylabel('Hemoglobin Level')

# Remove spines
sns.despine(left=True, bottom=True)
plt.show()

# Set plot style

# Create violin plot using Seaborn


ax = sns.violinplot(x='Result', y='Hemoglobin', hue='Gender', data=df_copy,
palette=custom_colors_gender, split=True)

# Set plot title and axis labels


ax.set_title('Distribution of Hemoglobin Levels by Gender and Anemic Condition', fontsize=14,
fontweight='bold')
ax.set_xlabel('Anemia Result', fontsize=12, fontweight='bold')
ax.set_ylabel('Hemoglobin Level', fontsize=12, fontweight='bold')

# Add legend and adjust its position


ax.legend(title='Gender', title_fontsize=12, fontsize=10, loc='upper right')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

df[['Gender', 'Hemoglobin', 'Result', 'MCH', 'MCV', 'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)

sns.pairplot(df,hue='Result')

sns.set(style="ticks")

RELATIONS_COLS = ["Hemoglobin", "MCH", "MCHC","MCV"]

g = sns.PairGrid(data=df, vars=RELATIONS_COLS, hue="Result", palette=custom_colors)


g.map_diag(sns.kdeplot, shade=True)
g.map_offdiag(sns.regplot, scatter_kws={'alpha':0.5})
g.add_legend(title="Result")
legend = g._legend

# set figure size


g.fig.set_size_inches(12, 12)

# update legend labels


new_labels = ['Non-anemic', 'Anemic']
for t, l in zip(g._legend.texts, new_labels): t.set_text(l)

# legend.texts[0].set_text('Non Anemic')
# legend.texts[1].set_text('Anemic')
# g.fig.suptitle("Relations in the Dataset", y=1.03)
**STATISTICAL TEST (T-TEST)**

A t-test is a statistical test used to determine whether there is a significant difference between the means of two groups. In our case, we are using a t-test to determine whether there is a significant difference in the mean hemoglobin levels between males and females.

As we saw, Hemoglobin has negative skewness, but the t-test assumes the data are normally distributed. So before performing the t-test, we would take the logarithm of the data, which can help to reduce the skewness; a sketch of this is shown below.
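The cells below run the t-test on the raw hemoglobin values; this is a minimal sketch of the log-transform variant described above, assuming df from the earlier cells and strictly positive hemoglobin values:

import numpy as np
from scipy.stats import ttest_ind

log_hb = np.log(df['Hemoglobin']) # log-transform to reduce the negative skewness
t_log, p_log = ttest_ind(log_hb[df['Gender'] == 0], log_hb[df['Gender'] == 1])
print("T-Statistic (log scale): {:.2f}, P-Value: {:.3f}".format(t_log, p_log))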

df_stat = df.copy()
df_stat.head()

male_hemoglobin = df_stat.loc[df_stat['Gender'] == 0, 'Hemoglobin']


female_hemoglobin = df_stat.loc[df_stat['Gender'] == 1, 'Hemoglobin']

# Compute the t-test statistic and p-value


t_statistic, p_value = ttest_ind(male_hemoglobin, female_hemoglobin)

# Print the results


print("T-Statistic: {:.2f}".format(t_statistic))
print("P-Value: {:.3f}".format(p_value))

# Compare the p-value with the significance level (0.05)


if p_value < 0.05:
    print("Reject null hypothesis: Gender has an impact on hemoglobin levels.")
else:
    print("Fail to reject null hypothesis: Gender has no impact on hemoglobin levels.")

**ODDS RATIO**

# Create binary variables for gender and anemia status


df_stat['is_female'] = np.where(df_stat['Gender'] == 1, 1, 0)
df_stat['is_anemic'] = np.where(df_stat['Result'] == 1, 1, 0)

# Fit a logistic regression model with gender and anemia status as predictors
logit_model = sm.Logit(df_stat['is_anemic'], sm.add_constant(df_stat['is_female']))
result = logit_model.fit()

# Print the odds ratio for gender


print("Odds Ratio for Gender: {:.2f}".format(np.exp(result.params[1])))

**chi-square test**

# Create a contingency table of gender and anemia status


cont_table = pd.crosstab(df_stat['Gender'], df_stat['Result'])
# Perform the chi-square test of independence
chi2_statistic, p_value, dof, expected = chi2_contingency(cont_table)

# Print the results


print("Chi-Square Statistic: {:.2f}".format(chi2_statistic))
print("P-Value: {:.3f}".format(p_value))

# Compare the p-value with the significance level (0.05)


if p_value < 0.05:
    print("Reject null hypothesis: Gender and anemia status are dependent.")
else:
    print("Fail to reject null hypothesis: Gender and anemia status are independent.")

**FEATURE SELECTION**

CORRELATION: PEARSON CORRELATION

df[['Gender', 'Hemoglobin', 'Result', 'MCH', 'MCV', 'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)

# create a correlation matrix


corr_matrix = df.corr().round(2)

# plot the correlation matrix using a heatmap from seaborn


sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
#plt.title('Correlation Matrix', fontweight='bold')
plt.show()

**SELECTKBEST**

import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = df.iloc[:,0:5] # independent columns


y = df.iloc[:,5]

k_values = [2, 3, 4, 5] # different values of K to try


best_k = 0 # variable to keep track of best K value
best_score = 0 # variable to keep track of best score

for k in k_values:
    # apply SelectKBest class to extract top k best features
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_) # chi2 score for each feature (independent of k)
    dfcolumns = pd.DataFrame(X.columns)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score'] # naming the dataframe columns

    # track the K value with the highest total score
    # (note: the chi2 scores are computed per feature and do not change with k,
    # so this comparison simply keeps the first k tried)
    if featureScores['Score'].sum() > best_score:
        best_score = featureScores['Score'].sum()
        best_k = k

print(f"The best value of K is {best_k} with score {best_score}.")

print("---")
print(featureScores)
print("---")
print(featureScores.nlargest(3,'Score'))
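Once a value of K is chosen, the same SelectKBest object can also reduce X to the selected columns. A minimal sketch reusing the X, y and chi2 names defined above:

selector = SelectKBest(score_func=chi2, k=2)
X_reduced = selector.fit_transform(X, y)
print("Selected features:", list(X.columns[selector.get_support()]))
print("Reduced shape:", X_reduced.shape)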

**Extremely Randomized Trees.**

# Extremely Randomized Trees.


from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

#plot graph of feature importances for better visualization


feat_importances = pd.Series(model.feature_importances_, index=X.columns)

feat_importances.nlargest(3).plot(kind='barh', color='#808080')
plt.xlabel('Importance')
# plt.title('Top 3 Features Importance', fontweight='bold', fontsize=12)
# Remove spines
sns.despine(left=False, bottom=True)
plt.savefig('Top2Feature.jpg') # save the figure before plt.show() so the saved image is not blank
plt.show()

**SCALING FEATURES**

# Scale Hemoglobin by log


df['Hemoglobin_log'] = np.log(df.Hemoglobin + 0.01)

# Scale Hemoglobin by Standardization


from sklearn.preprocessing import StandardScaler # importing a class from a module of a library

ss = StandardScaler() # object of the class StandardScaler ()


df['Hemoglobin_scaled'] = ss.fit_transform(df['Hemoglobin'].values.reshape(-1,1))

#SCALE BY NORMALIZATION
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler() # object of the class MinMaxScaler()
df['Hemoglobin_minmax'] = mm.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
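As a sanity check on the scaled columns created above, the same transformations can be written out by hand: standardization subtracts the mean and divides by the population standard deviation, and min-max rescales to the [0, 1] range. A minimal sketch:

import numpy as np

hb = df['Hemoglobin']
manual_standard = (hb - hb.mean()) / hb.std(ddof=0) # StandardScaler uses the population std (ddof=0)
manual_minmax = (hb - hb.min()) / (hb.max() - hb.min())
print(np.allclose(manual_standard, df['Hemoglobin_scaled']), np.allclose(manual_minmax, df['Hemoglobin_minmax']))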

# Feature engineering for a better visualization of the values

# Let's explore the scaled features by Result and see the distribution of Hemoglobin
fig , axs = plt.subplots(nrows = 1 , ncols = 4 , figsize = (16,4))

sns.boxplot(x ="Result",y="Hemoglobin",data=df, ax = axs[0])


axs[0].set_title("Result vs Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_log",data=df, ax = axs[1])


axs[1].set_title("Result vs Log Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_scaled",data=df, ax = axs[2])


axs[2].set_title("Result vs Scaled Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_minmax",data=df, ax = axs[3])


axs[3].set_title("Result vs Min Max Hemoglobin")

# fig.suptitle('Amount by Class', fontsize=20)


plt.show()

**Splitting data into Training and Testing samples(70:30)**

df.columns

# Separate Target Variable and Predictor Variables


# Here I am keeping the selected features only
X = df.drop(['MCHC', 'Hemoglobin_log', 'Hemoglobin_scaled', 'Hemoglobin_minmax', 'Result', 'MCH'], axis=1)
y = df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True,


random_state=101)

# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)

**CLASSIFICATION MODELS**
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression # Importing Classifier Step

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Model Evaluation
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred))

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred , y_test)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred)))
# print('Confusion Matrix : \n', cnf_matrix)
print("\n")

# Predicted values counts for Anemic and Non Anemic of test dataset
pd.Series(y_pred).value_counts()

# Actual values counts for Anemic and Non Anemic of test dataset
pd.Series(y_test).value_counts()

183/181

**MODEL EVALUATION MATRIX**

# confusion matrix

cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix

# Heatmap for Confusion Matrix


p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="autumn"
,fmt='g')

plt.title('Confusion matrix', y=1.1, fontsize = 22)


plt.ylabel('Actual',fontsize = 18)
plt.xlabel('Predicted',fontsize = 18)

# ax.xaxis.set_ticklabels(['Genuine', 'Fraud']);
# ax.yaxis.set_ticklabels(['Genuine', 'Fraud']);

plt.show()

181/181
**ROC**

metrics.roc_auc_score(y_test , y_pred)

y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba

# plot ROC Curve

plt.figure(figsize=(8,6))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)


auc = metrics.roc_auc_score(y_test, y_pred)
print("AUC - ",auc,"\n")

plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)


plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC curve for anemic cases classification', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred)

print('f1=%.3f' % (f1))

# create figure and axis objects with custom size and padding
fig, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)

# plot no skill
ax.plot([0, 1], [0.5, 0.5], linestyle='--', color='gray', lw=1)

# plot the precision-recall curve


ax.plot(recall, precision, marker='.', markersize=5)

# set axis labels and title


ax.set_xlabel('Recall', fontsize=14)
ax.set_ylabel('Precision', fontsize=14)
ax.set_title('Precision-Recall Curve for anemic cases classification', fontsize=16)

# show F1 score in the plot


ax.text(0.05, 0.95, f'F1 Score = {f1:.3f}', transform=ax.transAxes, fontsize=14)
# show the plot
plt.show()
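The ROC and precision-recall curves above are computed from hard 0/1 predictions, so they contain only a handful of points. A sketch using the predicted probabilities (y_pred_proba from the earlier cell) gives the full threshold sweep:

proba_pos = y_pred_proba[:, 1] # probability of the positive (anemic) class
fpr_p, tpr_p, _ = metrics.roc_curve(y_test, proba_pos)
precision_p, recall_p, _ = metrics.precision_recall_curve(y_test, proba_pos)
print("AUC from probabilities: {0:0.5f}".format(metrics.roc_auc_score(y_test, proba_pos)))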

# As found in EDA, the response variable Result has unequal class counts.

# Imbalanced data typically refers to classification problems where the classes are not
# represented equally. If one applies classifiers to such a dataset, they are likely to predict
# everything as the majority class. This is often regarded as a problem in learning from highly
# imbalanced datasets.

# To tackle the imbalance, we will be focusing on:

# Random Oversampling
# Random oversampling duplicates examples from the minority class in the training dataset and can
# result in overfitting for some models.

# Random Undersampling
# Random undersampling deletes examples from the majority class and can result in losing
# information invaluable to a model.

# Synthetic Minority Over-sampling Technique (SMOTE)
# In this technique, instead of simply duplicating data from the minority class, we synthesize new
# data from the minority class. This type of data augmentation for tabular data can be very
# effective. This approach to synthesizing new data is called the Synthetic Minority Over-sampling
# Technique, or SMOTE for short. (A minimal sketch of the idea follows this block.)

# Adaptive Synthetic Sampling Method for Imbalanced Data (ADASYN)
# ADASYN (Adaptive Synthetic) is an algorithm that generates synthetic data; its greatest
# advantages are that it does not copy the same minority data and that it generates more data for
# "harder to learn" examples.
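# A minimal sketch of the core SMOTE idea (not the imblearn implementation): a synthetic minority
# sample is an interpolation between an existing minority point and one of its nearest minority
# neighbours. It assumes X_train and y_train from the split above.

import numpy as np
from sklearn.neighbors import NearestNeighbors

minority = X_train[y_train == 1].to_numpy()
nn = NearestNeighbors(n_neighbors=2).fit(minority) # neighbour 0 of a point is the point itself
_, idx = nn.kneighbors(minority[:1])
neighbour = minority[idx[0, 1]]
synthetic = minority[0] + np.random.rand() * (neighbour - minority[0])
print("Synthetic minority sample:", synthetic)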

# Import imbalace technique algorithims

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter # counter takes values returns value_counts dictionary
from sklearn.datasets import make_classification

print('Original dataset shape %s' % Counter(y_train))

# Undersampling only on train


rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_rus))

# Undersampling with Logistic Regression


logreg = LogisticRegression()
logreg.fit(X_train_rus, y_train_rus)

y_pred_rus = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred_rus , y_test)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_rus)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_rus)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_rus)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_rus)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_rus)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_rus)

auc = metrics.roc_auc_score(y_test, y_pred_rus)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Undersampling', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_rus)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_rus)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Undersampling', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix

cnf_matrix = metrics.confusion_matrix(y_test , y_pred_rus)


sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="winter"
,fmt='g')

plt.title('Confusion matrix Random Undersampling', y=1.1, fontsize = 12, fontweight='bold')


plt.xlabel('Predicted',fontsize = 12, fontweight='bold')
plt.ylabel('Actual',fontsize = 12, fontweight='bold')

# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);


# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);

plt.show()

from imblearn.over_sampling import RandomOverSampler

print('Original dataset shape %s' % Counter(y_train))

ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_ros))

# Oversampling with Logistic Regression


logreg = LogisticRegression()
logreg.fit(X_train_ros, y_train_ros)

y_pred_ros = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_ros)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_ros)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_ros)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_ros)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_ros)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_ros)))

An F1 score of 0.94 on the test set with data leakage and a score of 0.94 without data leakage: here, data leakage did not have a significant impact on the model's performance. (A leakage-free cross-validation sketch is shown below.)
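A leakage-free way to evaluate resampling under cross-validation is to place the sampler inside an imblearn pipeline, so it is refit on each training fold only. A minimal sketch using imbalanced_make_pipeline (imported at the top) and RandomOverSampler (imported above):

leak_free = imbalanced_make_pipeline(RandomOverSampler(random_state=42), LogisticRegression(max_iter=1000))
cv_f1 = cross_val_score(leak_free, X, y, cv=5, scoring='f1')
print("Leakage-free CV F1: {:.3f} +/- {:.3f}".format(cv_f1.mean(), cv_f1.std()))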
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_ros)

auc = metrics.roc_auc_score(y_test, y_pred_ros)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Oversampling', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_ros)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_ros)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Oversampling', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix


cnf_matrix = metrics.confusion_matrix(y_test , y_pred_ros)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="summer"
,fmt='g')

plt.title('Confusion matrix Random Oversampling ', y=1.1, fontsize=12, fontweight='bold')


plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)

# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);


# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);

plt.show()

# Logistic Regression with SMOTE data


from imblearn.over_sampling import SMOTE, ADASYN

print('Original dataset shape %s' % Counter(y_train))

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_smote))

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)

y_pred_smote = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_smote)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_smote)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_smote)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_smote)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_smote)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_smote)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_smote)

auc = metrics.roc_auc_score(y_test, y_pred_smote)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12, fontweight='bold')
plt.title('ROC curve for LR SMOTE', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_smote)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_smote)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR SMOTE', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix


cnf_matrix = metrics.confusion_matrix(y_test , y_pred_smote)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Purples"
,fmt='g')

plt.title('Confusion matrix with SMOTE', y=1.1, fontsize = 12)


plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)

plt.show()

#Logistic Regression with ADASYN data


print('Original dataset shape %s' % Counter(y_train))

adasyn = ADASYN(random_state=42)

X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)


print('Resampled dataset shape %s' % Counter(y_train_adasyn))
# ADASYN Sampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_adasyn, y_train_adasyn)

y_pred_adasyn = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_adasyn)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_adasyn)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_adasyn)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_adasyn)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_adasyn)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_adasyn)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_adasyn)

auc = metrics.roc_auc_score(y_test, y_pred_adasyn)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12,fontweight='bold')
plt.title('ROC curve for LR ADASYN', fontsize=12,fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_adasyn)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_adasyn)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR ADASYN', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix

cnf_matrix = metrics.confusion_matrix(y_test , y_pred_adasyn)


sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Wistia"
,fmt='g')

plt.title('Confusion matrix with LR ADASYN', y=1.1, fontsize = 12,fontweight='bold')


plt.xlabel('Predicted',fontsize = 12,fontweight='bold')
plt.ylabel('Actual',fontsize = 12,fontweight='bold')

plt.show()

**DISTRIBUTION OF BALANCED DATA SET (BUILDING DIFFERENT MODELS)**

names_lst = []

# Empty list to capture performance matrix for train set


aucs_train_lst = []
accuracy_train_lst = []
precision_train_lst = []
recall_train_lst = []
f1_train_lst = []

# Empty list to capture performance matrix for test set


aucs_test_lst = []
accuracy_test_lst = []
precision_test_lst = []
recall_test_lst = []
f1_test_lst = []
kappa_lst = []

# Function for model building and performance measure

def build_measure_model(models):
    plt.figure(figsize=(12,6))

    for name, model, X_train, y_train, X_test, y_test in models:

        names_lst.append(name)

        # Build model
        model.fit(X_train, y_train)
        # Predict
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # calculate accuracy
        Accuracy_train = metrics.accuracy_score(y_train, y_train_pred)
        accuracy_train_lst.append(Accuracy_train)
        Accuracy_test = metrics.accuracy_score(y_test, y_test_pred)
        accuracy_test_lst.append(Accuracy_test)

        # calculate auc
        Aucs_train = metrics.roc_auc_score(y_train, y_train_pred)
        aucs_train_lst.append(Aucs_train)
        Aucs_test = metrics.roc_auc_score(y_test, y_test_pred)
        aucs_test_lst.append(Aucs_test)

        # calculate precision
        PrecisionScore_train = metrics.precision_score(y_train, y_train_pred)
        precision_train_lst.append(PrecisionScore_train)
        PrecisionScore_test = metrics.precision_score(y_test, y_test_pred)
        precision_test_lst.append(PrecisionScore_test)

        # calculate recall
        RecallScore_train = metrics.recall_score(y_train, y_train_pred)
        recall_train_lst.append(RecallScore_train)
        RecallScore_test = metrics.recall_score(y_test, y_test_pred)
        recall_test_lst.append(RecallScore_test)

        # calculate f1 score
        F1Score_train = metrics.f1_score(y_train, y_train_pred)
        f1_train_lst.append(F1Score_train)
        F1Score_test = metrics.f1_score(y_test, y_test_pred)
        f1_test_lst.append(F1Score_test)

        #print('F1 Score of '+ name +' model : {0:0.5f}'.format(F1Score_test))

        # calculate kappa statistic
        kappa = cohen_kappa_score(y_test, y_test_pred)
        kappa_lst.append(kappa)

        # draw confusion matrix
        cnf_matrix = metrics.confusion_matrix(y_test, y_test_pred)

        print("Model Name :", name)
        print('Train Accuracy :{0:0.5f}'.format(Accuracy_train))
        print('Test Accuracy :{0:0.5f}'.format(Accuracy_test))
        print('Train AUC : {0:0.5f}'.format(Aucs_train))
        print('Test AUC : {0:0.5f}'.format(Aucs_test))
        print('Train Precision : {0:0.5f}'.format(PrecisionScore_train))
        print('Test Precision : {0:0.5f}'.format(PrecisionScore_test))
        print('Train Recall : {0:0.5f}'.format(RecallScore_train))
        print('Test Recall : {0:0.5f}'.format(RecallScore_test))
        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))
        print('Kappa Statistic : {0:0.5f}'.format(kappa))
        print('Confusion Matrix : \n', cnf_matrix)
        print("\n")

        # plot ROC Curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred)
        auc = metrics.roc_auc_score(y_test, y_test_pred)
        plt.plot(fpr, tpr, linewidth=2, label=name + ", auc=" + str(auc))

    #---------- For loop ends here --------#

    plt.legend(loc=4)
    plt.plot([0,1], [0,1], 'k--')
    plt.rcParams['font.size'] = 12
    #plt.title('ROC curve for Predicting a anemia cases')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()

**DECISION TREE**

DTmodels = []

dt = DecisionTreeClassifier()

DTmodels.append(('DT imbalance', dt,X_train,y_train,X_test,y_test))


DTmodels.append(('DT Undersampling', dt,X_train_rus,y_train_rus,X_test,y_test))
DTmodels.append(('DT Oversampling', dt,X_train_ros,y_train_ros,X_test,y_test))
DTmodels.append(('DT SMOTE', dt,X_train_smote,y_train_smote,X_test,y_test))
DTmodels.append(('DT ADASYN', dt,X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(DTmodels)

**RANDOM FOREST**

# Random Forest (RF)


RFmodels = []

RFmodels.append(('RF imbalance', RandomForestClassifier(),X_train,y_train,X_test,y_test))


RFmodels.append(('RF Undersampling',
RandomForestClassifier(),X_train_rus,y_train_rus,X_test,y_test))
RFmodels.append(('RF Oversampling',
RandomForestClassifier(),X_train_ros,y_train_ros,X_test,y_test))
RFmodels.append(('RF SMOTE',
RandomForestClassifier(),X_train_smote,y_train_smote,X_test,y_test))
RFmodels.append(('RF ADASYN',
RandomForestClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(RFmodels)

**KNN**

# K-Nearest Neighbors (KNN)


KNNmodels = []

KNNmodels.append(('KNN imbalance', KNeighborsClassifier(),X_train,y_train,X_test,y_test))


KNNmodels.append(('KNN Undersampling',
KNeighborsClassifier(),X_train_rus,y_train_rus,X_test,y_test))
KNNmodels.append(('KNN Oversampling',
KNeighborsClassifier(),X_train_ros,y_train_ros,X_test,y_test))
KNNmodels.append(('KNN SMOTE',
KNeighborsClassifier(),X_train_smote,y_train_smote,X_test,y_test))
KNNmodels.append(('KNN ADASYN',
KNeighborsClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(KNNmodels)

**SVM**

# Support Vector Machines (SVM)


SVMmodels = []

SVMmodels.append(('SVM imbalance', SVC(gamma='auto'),X_train,y_train,X_test,y_test))


SVMmodels.append(('SVM Undersampling',
SVC(gamma='auto'),X_train_rus,y_train_rus,X_test,y_test))
SVMmodels.append(('SVM Oversampling',
SVC(gamma='auto'),X_train_ros,y_train_ros,X_test,y_test))
SVMmodels.append(('SVM SMOTE',
SVC(gamma='auto'),X_train_smote,y_train_smote,X_test,y_test))
SVMmodels.append(('SVM ADASYN',
SVC(gamma='auto'),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(SVMmodels)

# Gaussian Naive Bayes (NB)


NBmodels = []

NBmodels.append(('NB imbalance', GaussianNB(),X_train,y_train,X_test,y_test))


NBmodels.append(('NB Undersampling', GaussianNB(),X_train_rus,y_train_rus,X_test,y_test))
NBmodels.append(('NB Oversampling', GaussianNB(),X_train_ros,y_train_ros,X_test,y_test))
NBmodels.append(('NB SMOTE', GaussianNB(),X_train_smote,y_train_smote,X_test,y_test))
NBmodels.append(('NB ADASYN', GaussianNB(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(NBmodels)

from sklearn.ensemble import RandomForestClassifier


from sklearn import metrics

# Random Forest model training


random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Predict from the test set


y_pred = random_forest.predict(X_test)

# Model evaluation
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: {0:0.5f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUC: {0:0.5f}'.format(metrics.roc_auc_score(y_test, y_pred)))
print('Precision: {0:0.5f}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall: {0:0.5f}'.format(metrics.recall_score(y_test, y_pred)))
print('F1: {0:0.5f}'.format(metrics.f1_score(y_test, y_pred)))

**PERFORMANCE MEASURE OF CLASSIFIERS**

data = {'Model':names_lst,
#'Accuracy_Train':accuracy_train_lst,
'Accuracy_Test':accuracy_test_lst,
#'AUC_Train':aucs_train_lst,
'AUC_Test':aucs_test_lst,
#'PrecisionScore_Train':precision_train_lst,
'PrecisionScore_Test':precision_test_lst,
#'RecallScore_Train':recall_train_lst,
'RecallScore_Test':recall_test_lst,
#'F1Score_Train':f1_train_lst,
'F1Score_Test':f1_test_lst,
'Kappa Stat' : kappa_lst
}

print("Performance measures of various classifiers: \n")


performance_df = pd.DataFrame(data)
performance_df = performance_df.round(3)
finaltable = performance_df.sort_values(['F1Score_Test', 'RecallScore_Test', 'AUC_Test'], ascending=False)
finaltable

finaltable.to_excel('my_table.xlsx', index=False)

**HYPERPARAMETER TUNING**

# Use GridSearchCV to find the best parameters.


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# define the classifiers to be evaluated


classifiers_ = [DecisionTreeClassifier(),
RandomForestClassifier(),
SVC(),
GaussianNB(),
LogisticRegression(),
KNeighborsClassifier()]

# define the parameter grids for each classifier


param_grids = [{'max_depth': range(1, 10), 'criterion': ['gini', 'entropy']}, # decision tree
               {'n_estimators': [50, 100, 200], 'max_depth': range(1, 10)}, # random forest
               {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}, # SVM
               {}, # naive bayes - no hyperparameters to tune
               # Naive Bayes is a probabilistic classifier based on Bayes' theorem and the "naive"
               # assumption that each feature is independent of every other feature given the class.
               # Unlike decision trees, SVM, or logistic regression, it exposes no regularization
               # hyperparameters worth searching over here.
               {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, # logistic regression
               {"n_neighbors": list(range(2, 60, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}] # KNN
# perform GridSearchCV for each classifier
for clf, param_grid in zip(classifiers_, param_grids):
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print(clf.__class__.__name__, "best params:", grid_search.best_params_, "best score:", grid_search.best_score_)
    print("F1 score:", f1_score(y_test, y_pred))

**CROSS VALIDATION**

# perform 5-fold cross-validation for each classifier


for clf in classifiers_:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, "mean accuracy:", scores.mean(), "std deviation:", scores.std())
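StratifiedKFold was imported at the top but never used; it keeps the anemic/non-anemic ratio constant in every fold, which is generally safer for an imbalanced target. A minimal sketch:

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
strat_scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=skf, scoring='f1')
print("Stratified 5-fold F1: {:.3f} +/- {:.3f}".format(strat_scores.mean(), strat_scores.std()))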

from sklearn.tree import export_graphviz


import graphviz

# hyperparameter-tuned decision tree
dt_tuning = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

# Train the decision tree based on selected parameter


dt_tuning.fit(X_train, y_train)
# Generate a DOT file representing the decision tree
dot_data = export_graphviz(dt_tuning, out_file=None, feature_names=X_train.columns,
class_names=['Non Anemic', 'Anemic'], filled=True, rounded=True)

# Visualize the decision path for the new data point


graph = graphviz.Source(dot_data)
graph.render('decision_tree') # save the decision tree as a PDF file
graph

# Testing data

new_data = { 'Gender': 1, 'Hemoglobin': 11, 'MCV':50}


new_X = pd.DataFrame([new_data])
prediction = dt_tuning.predict(new_X) # use the tuned decision tree fitted above
print('Prediction:', prediction)

import matplotlib.pyplot as plt

# Define the labels, scores and colors for each model


labels = ['Decision Tree', 'Random Forest', 'SVM', 'Naive Bayes', 'Logistic Regression', 'KNN']
scores = [1.00, 1.00, 0.994, 0.914, 0.935, 0.988,0.975]
colors = ['#50BFE6','#9C51B6','#FF5470','#0066CC','#FF5050','#E97451']

# Sort the scores and labels in descending order


sorted_scores, sorted_labels = zip(*sorted(zip(scores, labels), reverse=True))

# Set up the plot


fig, ax = plt.subplots(figsize=(12,8))
ax.bar(sorted_labels, sorted_scores, color=colors)

# Set the title and axis labels


ax.set_title('Comparison of Model Performance Grid search', fontsize=12, fontweight='bold')
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy Score', fontsize=12, fontweight='bold')

# Set the tick font size


ax.tick_params(axis='both', which='major', labelsize=12)

# Add the accuracy score as text above each bar


for i, score in enumerate(sorted_scores):
    ax.text(i, score+0.01, f'{score*100:.1f}%', fontsize=12, ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

# Display the plot


plt.show()

**EXPORTING THE MODEL**

import pickle
from sklearn.ensemble import RandomForestClassifier

# Save the Random Forest model as a pickle file


filename = 'random_forest_model.pkl'
pickle.dump(random_forest, open(filename, 'wb'))

print("Random Forest model exported as pickle file:", filename)
