Anemia Code

#Print system version

!jupyter --version
import sys
print("Python version:", sys.version)

# importing necessary libraries

import pandas as pd # for data manipulation and analysis


import collections # for creating and manipulating Python's collections like OrderedDict, defaultdict, Counter, etc.
import numpy as np # for scientific computing with Python
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import seaborn as sns # for advanced visualization

# Classifier Libraries
from sklearn.linear_model import LogisticRegression # for implementing logistic regression algorithm
from sklearn.tree import DecisionTreeClassifier # for implementing decision tree algorithm
from sklearn.ensemble import RandomForestClassifier # for implementing random forest algorithm
from sklearn.svm import SVC # for implementing Support Vector Machine (SVM) algorithm
from sklearn.naive_bayes import GaussianNB # for implementing Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier # for implementing K-Nearest Neighbors (KNN) algorithm

# For Statistical testing


from scipy.stats import ttest_ind # for computing t-test for two independent samples
import statsmodels.api as sm # for statistical models and tests
from scipy.stats import chi2_contingency # for computing chi-square statistic and p-value for a contingency table
import scipy.stats as stats # for implementing skewness and other stats

# Other Libraries
from sklearn.model_selection import train_test_split # for splitting data into training and testing sets
from sklearn.pipeline import make_pipeline # for building a pipeline of transforms with a final estimator
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline # for building a pipeline with imbalanced datasets
from imblearn.over_sampling import SMOTE # for oversampling imbalanced datasets using Synthetic Minority Over-sampling Technique (SMOTE)
from imblearn.under_sampling import NearMiss # for undersampling imbalanced datasets using the NearMiss algorithm
from imblearn.metrics import classification_report_imbalanced # for generating a classification report for imbalanced datasets
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report # for computing various performance metrics for classification models
from collections import Counter # for counting the frequency of elements in a list
from sklearn.model_selection import KFold, StratifiedKFold # for k-fold cross-validation
from sklearn.model_selection import cross_val_score # for evaluating a model using cross-validation
from sklearn.metrics import cohen_kappa_score # for computing Cohen's kappa score for inter-rater agreement

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 5000) # for setting the maximum number of columns to display in pandas dataframes

# first read the data file


df= pd.read_csv('/content/drive/MyDrive/anemia.csv')

df.shape

df.head()

# Print summary statistics


df.describe()

df.shape

df.info()

#columns name
df.columns

#Checking Null
# Import numpy
import numpy as np

# Inspect missing values in the dataset


print(df.isnull().values.sum())

# Replace the ' 's with NaN


df = df.replace(" ",np.NaN)

# Count the number of NaNs in the dataset to verify


print(df.isnull().values.sum())

# Create a copy of the DataFrame to avoid modifying the original data


df_copy = df.copy()

# Rename values in the 'Result' column just for the plot


df_copy['Result'] = df_copy['Result'].replace({0: 'Non Anemic', 1: 'Anemic'})
# Rename values in the 'Gender' column
df_copy['Gender'] = df_copy['Gender'].replace({0: 'Male', 1: 'Female'})

# Define custom hex colors


custom_colors = ['#B43757', '#a37b85']
custom_colors_gender = ['#90ADC6', '#C6A990']
print(df_copy)

result_counts = df_copy['Result'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors,
shadow=True)
plt.title('Distribution of Anemia Result')
plt.show()

# Create a count plot of the anemia result


ax= sns.countplot(x='Result', data=df_copy, palette=custom_colors)
plt.title('Count of Anemia Result')

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

plt.show()

result_counts = df_copy['Result'].value_counts()

# Print the counts of the two categories


print(result_counts)

# Check if the two categories are balanced or not


if result_counts[0] == result_counts[1]:
    print('The two categories are balanced.')
else:
    print('The two categories are not balanced.')

print("-----")
# The classes are heavily skewed we need to solve this issue later.
print('Non Anemic', round(df['Result'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Anemic', round(df['Result'].value_counts()[1]/len(df) * 100,2), '% of the dataset')

ax= sns.countplot(x='Gender', hue ='Result', data=df_copy, palette=custom_colors)


plt.title('Number of Individuals with and without Anemia by Gender')

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

plt.show()

result_counts = df_copy['Gender'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors_gender,
shadow=True)
plt.title('Gender distribution ')
plt.show()

df.head()

df_copy.head()

# anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()

# ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=custom_colors_gender)


# ax.set_xticklabels(['Male', 'Female'])
# plt.title('Mean Anemia Rate by Gender')
# plt.xlabel('Gender')
# plt.ylabel('Mean Anemia Rate')
# plt.show()

print(sns.barplot.__doc__)

color_gen = {0: '#90ADC6', 1: '#C6A990'} # keys match the numeric Gender codes (0 = Male, 1 = Female)


anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()

# Create the bar plot


ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=color_gen)

# Add labels to the bars


for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.2f}'.format(p.get_height()), ha='center')
ax.set_xticklabels(['Male', 'Female'])

# Add plot titles and labels


plt.title('Mean Anemia Rate by Gender | Which gender is more often anemic?', fontsize=16, fontweight='bold')
plt.xlabel('Gender' , fontsize=12)
plt.ylabel('Mean Anemia Rate' , fontsize=12)

# Remove spines
sns.despine(left=True, bottom=True)

# Remove vertical lines from the grid


plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)

plt.figure(figsize=(8, 6))
# Show the plot
plt.show()

anemia_rates = df.groupby('Gender')['Result'].mean().round(2)
anemia_rates

# Create separate subsets for males and females


male_data = df_copy[df_copy['Gender'] == 'Male']
female_data = df_copy[df_copy['Gender'] == 'Female']

# Plot horizontal violinplot using Seaborn


sns.violinplot(x='Hemoglobin', y='Gender', hue='Result', data=df_copy, palette=custom_colors,
inner='quartile', scale='width', cut=0)

# Add mean and median lines


for i, group in enumerate([male_data, female_data]):
    median = group['Hemoglobin'].median()
    mean = group['Hemoglobin'].mean()
    plt.axhline(y=i, xmin=0.05, xmax=0.48, color='black', linewidth=2)
    plt.text(0.51, i+0.1, f'Median: {median:.2f}', ha='left', va='center')
    plt.text(0.51, i-0.1, f'Mean: {mean:.2f}', ha='left', va='center')

# Add IQR whiskers


q1_male, q3_male = male_data['Hemoglobin'].quantile([0.25, 0.75])
q1_female, q3_female = female_data['Hemoglobin'].quantile([0.25, 0.75])
plt.axhline(y=0, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.axhline(y=1, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.plot([q1_male, q1_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q3_male, q3_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q1_female, q1_female], [0.8, 1.2], color='black', linewidth=2)
plt.plot([q3_female, q3_female], [0.8, 1.2], color='black', linewidth=2)
plt.text((q1_male+q3_male)/2, -0.3, f'IQR: {q3_male-q1_male:.2f}', ha='center', va='center')
plt.text((q1_female+q3_female)/2, 1.3, f'IQR: {q3_female-q1_female:.2f}', ha='center', va='center')

# Add title and labels


plt.title('Distribution of Hemoglobin Levels by Gender')
plt.xlabel('Hemoglobin Level')
plt.ylabel('Gender')

# Show the plot


plt.show()

iqr = np.percentile(df['Hemoglobin'], 75) - np.percentile(df['Hemoglobin'], 25)

# Bin width using the Freedman-Diaconis rule


bin_width = 2 * iqr / (len(df)**(1/3))
sns.distplot(df['Hemoglobin'], hist=True, kde=True,
bins=int(round((df['Hemoglobin'].max() - df['Hemoglobin'].min()) / bin_width)),
color='#d60266',
hist_kws={'edgecolor':'black', 'alpha': 0.8},
kde_kws={'linewidth': 2})

# Add labels and adjust font sizes


#plt.title('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
plt.xlabel('Hemoglobin', fontsize=12)
plt.ylabel('Count', fontsize=12)

# # Add legend
# plt.legend(labels=['Hemoglobin'], loc='upper right')

# Remove spines
sns.despine(left=True, bottom=True)

# Remove vertical lines from the grid


plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)

# Adjust plot size


plt.figure(figsize=(8, 6))
# Show plot
plt.show()
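As a quick cross-check on the Freedman-Diaconis bin width computed above, numpy can derive the same binning directly. A minimal sketch, assuming df and the Hemoglobin column from the earlier cells:

fd_edges = np.histogram_bin_edges(df['Hemoglobin'], bins='fd') # 'fd' applies the Freedman-Diaconis rule
print("Number of Freedman-Diaconis bins:", len(fd_edges) - 1)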

# Calculate skewness using the skew() function


skewness = stats.skew(df['Hemoglobin'])

# Calculate kurtosis using the kurtosis() function (fisher=False gives Pearson's definition)


kurtosis = stats.kurtosis(df['Hemoglobin'], fisher=False)

# Print the result


print("Skewness:", skewness)
# Print the result
print("Kurtosis:", kurtosis)

# Create a dictionary with the values


hemoglobin_data = {'Metric': ['Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
                   'Value': [df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}

# Create a pandas DataFrame from the dictionary


hemoglobin_table = pd.DataFrame(hemoglobin_data)

# Create the table using Seaborn styling


styled_table = (hemoglobin_table.style
.set_caption('Hemoglobin Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter the data by anemia status


anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='Hemoglobin', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='Hemoglobin', ax=ax2, color='green', binwidth=0.5)

# Set titles and axis labels


ax1.set_title('Hemoglobin Levels in Patients with Anemia', fontsize=14, fontweight='bold')
ax2.set_title('Hemoglobin Levels in Patients without Anemia', fontsize=14, fontweight='bold')
fig.suptitle('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Remove spines
sns.despine(left=True, bottom=True)

# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)

# Adjust plot size


plt.tight_layout()

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular hemoglobin (MCH)


MCH_data = {'Metric': ['Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level'],
'Value': [df['MCH'].max(), df['MCH'].mean(), df['MCH'].min()]}

# Create a pandas DataFrame from the dictionary


MCH_table = pd.DataFrame(MCH_data)

# Create the table using Seaborn styling


styled_table = (MCH_table.style
.set_caption('MCH Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter the data by anemia status


anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCH', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='MCH', ax=ax2, color='green', binwidth=0.5)

# Set titles and axis labels


ax1.set_title('Mean Corpuscular Hemoglobin Levels in Patients with Anemia', fontsize=14,
fontweight='bold')
ax2.set_title('Mean Corpuscular Hemoglobin Levels in Patients without Anemia', fontsize=14,
fontweight='bold')
fig.suptitle('Distribution of Mean Corpuscular Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Remove spines
sns.despine(left=True, bottom=True)

# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)

# Adjust plot size


plt.tight_layout()

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular hemoglobin concentration (MCHC)


MCHC_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min()]}

# Create a pandas DataFrame from the dictionary


MCHC_table = pd.DataFrame(MCHC_data)

# Create the table using Seaborn styling


styled_table = (MCHC_table.style
.set_caption('MCHC Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# Filter data for anemia and non-anemia cases


anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCHC', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCHC', ax=ax2, color='green', bins=20)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Set titles and axis labels


ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia', fontweight='bold')
fig.suptitle('Mean Corpuscular Hemoglobin Concentration Levels', fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

# Filter data for anemia and non-anemia cases


anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))

# Plot histograms with Seaborn


sns.histplot(anemia_data, x='MCV', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCV', ax=ax2, color='green', bins=20)

# Customize tick labels and grid


ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Set titles and axis labels


ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia',fontweight='bold' )
fig.suptitle('Mean Corpuscular Volume Levels',fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

# Create a dictionary with the values for mean corpuscular volume (MCV)


MCV_data = {'Metric': ['Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level'],
'Value': [df['MCV'].max(), df['MCV'].mean(), df['MCV'].min()]}
# Create a pandas DataFrame from the dictionary
MCV_table = pd.DataFrame(MCV_data)

# Create the table using Seaborn styling


styled_table = (MCV_table.style
.set_caption('MCV Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

# dictionary with the values for MCHC, MCV, MCH, and hemoglobin
blood_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level',
'Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level',
'Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level',
'Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min(),
df['MCV'].max(), df['MCV'].mean(), df['MCV'].min(),
df['MCH'].max(), df['MCH'].mean(), df['MCH'].min(),
df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}

# Create a pandas df
blood_table = pd.DataFrame(blood_data)

# Create the table


styled_table = (blood_table.style
.set_caption('Blood Test Results')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption', 'props': [('font-size', '18px'),
('font-weight', 'bold'), ('padding-bottom', '10px')]}])

.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)

sns.set_style("whitegrid")
sns.boxplot(x='Result', y='Hemoglobin', data=df_copy, palette=custom_colors)
plt.title('Distribution of Hemoglobin Levels by Anemia Result')
plt.xlabel('Anemia Result')
plt.ylabel('Hemoglobin Level')

# Remove spines
sns.despine(left=True, bottom=True)
plt.show()

# Set plot style

# Create violin plot using Seaborn


ax = sns.violinplot(x='Result', y='Hemoglobin', hue='Gender', data=df_copy,
palette=custom_colors_gender, split=True)

# Set plot title and axis labels


ax.set_title('Distribution of Hemoglobin Levels by Gender and Anemic Condition', fontsize=14,
fontweight='bold')
ax.set_xlabel('Anemia Result', fontsize=12, fontweight='bold')
ax.set_ylabel('Hemoglobin Level', fontsize=12, fontweight='bold')

# Add legend and adjust its position


ax.legend(title='Gender', title_fontsize=12, fontsize=10, loc='upper right')

# Remove spines
sns.despine(left=True, bottom=True)

# Show the plot


plt.show()

df[['Gender', 'Hemoglobin', 'Result', 'MCH', 'MCV', 'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)

sns.pairplot(df,hue='Result')

sns.set(style="ticks")

RELATIONS_COLS = ["Hemoglobin", "MCH", "MCHC","MCV"]

g = sns.PairGrid(data=df, vars=RELATIONS_COLS, hue="Result", palette=custom_colors)


g.map_diag(sns.kdeplot, shade=True)
g.map_offdiag(sns.regplot, scatter_kws={'alpha':0.5})
g.add_legend(title="Result")
legend = g._legend

# set figure size


g.fig.set_size_inches(12, 12)

# update legend labels


new_labels = ['Non-anemic', 'Anemic']
for t, l in zip(g._legend.texts, new_labels): t.set_text(l)

# legend.texts[0].set_text('Non Anemic')
# legend.texts[1].set_text('Anemic')
# g.fig.suptitle("Relations in the Dataset", y=1.03)
**STATISTICAL TEST (T-TEST)**

A t-test is a statistical test used to determine whether there is a significant difference between the means of two groups. In our case, we are using a t-test to determine whether there is a significant difference in the mean hemoglobin levels between males and females.

As we saw, Hemoglobin has negative skewness, but the t-test assumes the data are normally distributed. So before performing the t-test, we would take the logarithm of the data, which can help to reduce the skewness; a sketch of this is shown below.
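The cells below run the t-test on the raw hemoglobin values; this is a minimal sketch of the log-transform variant described above, assuming df from the earlier cells and strictly positive hemoglobin values:

import numpy as np
from scipy.stats import ttest_ind

log_hb = np.log(df['Hemoglobin']) # log-transform to reduce the negative skewness
t_log, p_log = ttest_ind(log_hb[df['Gender'] == 0], log_hb[df['Gender'] == 1])
print("T-Statistic (log scale): {:.2f}, P-Value: {:.3f}".format(t_log, p_log))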

df_stat = df.copy()
df_stat.head()

male_hemoglobin = df_stat.loc[df_stat['Gender'] == 0, 'Hemoglobin']


female_hemoglobin = df_stat.loc[df_stat['Gender'] == 1, 'Hemoglobin']

# Compute the t-test statistic and p-value


t_statistic, p_value = ttest_ind(male_hemoglobin, female_hemoglobin)

# Print the results


print("T-Statistic: {:.2f}".format(t_statistic))
print("P-Value: {:.3f}".format(p_value))

# Compare the p-value with the significance level (0.05)


if p_value < 0.05:
    print("Reject null hypothesis: Gender has an impact on hemoglobin levels.")
else:
    print("Fail to reject null hypothesis: Gender has no impact on hemoglobin levels.")

**ODDS RATIO**

# Create binary variables for gender and anemia status


df_stat['is_female'] = np.where(df_stat['Gender'] == 1, 1, 0)
df_stat['is_anemic'] = np.where(df_stat['Result'] == 1, 1, 0)

# Fit a logistic regression model with gender and anemia status as predictors
logit_model = sm.Logit(df_stat['is_anemic'], sm.add_constant(df_stat['is_female']))
result = logit_model.fit()

# Print the odds ratio for gender


print("Odds Ratio for Gender: {:.2f}".format(np.exp(result.params[1])))

**chi-square test**

# Create a contingency table of gender and anemia status


cont_table = pd.crosstab(df_stat['Gender'], df_stat['Result'])
# Perform the chi-square test of independence
chi2_statistic, p_value, dof, expected = chi2_contingency(cont_table)

# Print the results


print("Chi-Square Statistic: {:.2f}".format(chi2_statistic))
print("P-Value: {:.3f}".format(p_value))

# Compare the p-value with the significance level (0.05)


if p_value < 0.05:
    print("Reject null hypothesis: Gender and anemia status are dependent.")
else:
    print("Fail to reject null hypothesis: Gender and anemia status are independent.")

**FEATURE SELECTION**

CORRELATION: PEARSON CORRELATION

df[['Gender', 'Hemoglobin', 'Result', 'MCH', 'MCV', 'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)

# create a correlation matrix


corr_matrix = df.corr().round(2)

# plot the correlation matrix using a heatmap from seaborn


sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
#plt.title('Correlation Matrix', fontweight='bold')
plt.show()

**SELECTKBEST**

import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = df.iloc[:,0:5] # independent columns


y = df.iloc[:,5]

k_values = [2, 3, 4, 5] # different values of K to try


best_k = 0 # variable to keep track of best K value
best_score = 0 # variable to keep track of best score

for k in k_values:
    # apply SelectKBest class to extract top k best features
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)

    dfscores = pd.DataFrame(fit.scores_) # chi2 score for each feature (independent of k)
    dfcolumns = pd.DataFrame(X.columns)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score'] # naming the dataframe columns

    # track the K value with the highest total score
    # (note: the chi2 scores are computed per feature and do not change with k,
    # so this comparison simply keeps the first k tried)
    if featureScores['Score'].sum() > best_score:
        best_score = featureScores['Score'].sum()
        best_k = k

print(f"The best value of K is {best_k} with score {best_score}.")

print("---")
print(featureScores)
print("---")
print(featureScores.nlargest(3,'Score'))
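Once a value of K is chosen, the same SelectKBest object can also reduce X to the selected columns. A minimal sketch reusing the X, y and chi2 names defined above:

selector = SelectKBest(score_func=chi2, k=2)
X_reduced = selector.fit_transform(X, y)
print("Selected features:", list(X.columns[selector.get_support()]))
print("Reduced shape:", X_reduced.shape)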

**Extremely Randomized Trees.**

# Extremely Randomized Trees.


from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)

print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

#plot graph of feature importances for better visualization


feat_importances = pd.Series(model.feature_importances_, index=X.columns)

feat_importances.nlargest(3).plot(kind='barh', color='#808080')
plt.xlabel('Importance')
# plt.title('Top 3 Features Importance', fontweight='bold', fontsize=12)
# Remove spines
sns.despine(left=False, bottom=True)
plt.savefig('Top2Feature.jpg') # save the figure before plt.show() so the saved image is not blank
plt.show()

**SCALING FEATURES**

# Scale Hemoglobin by log


df['Hemoglobin_log'] = np.log(df.Hemoglobin + 0.01)

# Scale Hemoglobin by Standardization


from sklearn.preprocessing import StandardScaler # importing a class from a module of a library

ss = StandardScaler() # object of the class StandardScaler ()


df['Hemoglobin_scaled'] = ss.fit_transform(df['Hemoglobin'].values.reshape(-1,1))

#SCALE BY NORMALIZATION
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler() # object of the class MinMaxScaler()
df['Hemoglobin_minmax'] = mm.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
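As a sanity check on the scaled columns created above, the same transformations can be written out by hand: standardization subtracts the mean and divides by the population standard deviation, and min-max rescales to the [0, 1] range. A minimal sketch:

import numpy as np

hb = df['Hemoglobin']
manual_standard = (hb - hb.mean()) / hb.std(ddof=0) # StandardScaler uses the population std (ddof=0)
manual_minmax = (hb - hb.min()) / (hb.max() - hb.min())
print(np.allclose(manual_standard, df['Hemoglobin_scaled']), np.allclose(manual_minmax, df['Hemoglobin_minmax']))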

# Feature engineering for a better visualization of the values

# Let's explore the scaled features by Result and see the distribution of Hemoglobin
fig , axs = plt.subplots(nrows = 1 , ncols = 4 , figsize = (16,4))

sns.boxplot(x ="Result",y="Hemoglobin",data=df, ax = axs[0])


axs[0].set_title("Result vs Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_log",data=df, ax = axs[1])


axs[1].set_title("Result vs Log Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_scaled",data=df, ax = axs[2])


axs[2].set_title("Result vs Scaled Hemoglobin")

sns.boxplot(x ="Result",y="Hemoglobin_minmax",data=df, ax = axs[3])


axs[3].set_title("Result vs Min Max Hemoglobin")

# fig.suptitle('Amount by Class', fontsize=20)


plt.show()

**Splitting data into Training and Testing samples(70:30)**

df.columns

# Separate Target Variable and Predictor Variables


# Here I am keeping the selected features only
X = df.drop(['MCHC', 'Hemoglobin_log', 'Hemoglobin_scaled', 'Hemoglobin_minmax', 'Result', 'MCH'], axis=1)
y = df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True,


random_state=101)

# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)

**CLASSIFICATION MODELS**
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression # Importing Classifier Step

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Model Evaluation
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred))

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred , y_test)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred)))
# print('Confusion Matrix : \n', cnf_matrix)
print("\n")

# Predicted values counts for Anemic and Non Anemic of test dataset
pd.Series(y_pred).value_counts()

# Actual values counts for Anemic and Non Anemic of test dataset
pd.Series(y_test).value_counts()

183/181

**MODEL EVALUATION MATRIX**

# confusion matrix

cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix

# Heatmap for Confusion Matrix


p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="autumn"
,fmt='g')

plt.title('Confusion matrix', y=1.1, fontsize = 22)


plt.ylabel('Actual',fontsize = 18)
plt.xlabel('Predicted',fontsize = 18)

# ax.xaxis.set_ticklabels(['Genuine', 'Fraud']);
# ax.yaxis.set_ticklabels(['Genuine', 'Fraud']);

plt.show()

181/181
**ROC**

metrics.roc_auc_score(y_test , y_pred)

y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba

# plot ROC Curve

plt.figure(figsize=(8,6))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)


auc = metrics.roc_auc_score(y_test, y_pred)
print("AUC - ",auc,"\n")

plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)


plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC curve for anemic cases classification', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred)

print('f1=%.3f' % (f1))

# create figure and axis objects with custom size and padding
fig, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)

# plot no skill
ax.plot([0, 1], [0.5, 0.5], linestyle='--', color='gray', lw=1)

# plot the precision-recall curve


ax.plot(recall, precision, marker='.', markersize=5)

# set axis labels and title


ax.set_xlabel('Recall', fontsize=14)
ax.set_ylabel('Precision', fontsize=14)
ax.set_title('Precision-Recall Curve for anemic cases classification', fontsize=16)

# show F1 score in the plot


ax.text(0.05, 0.95, f'F1 Score = {f1:.3f}', transform=ax.transAxes, fontsize=14)
# show the plot
plt.show()
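The ROC and precision-recall curves above are computed from hard 0/1 predictions, so they contain only a handful of points. A sketch using the predicted probabilities (y_pred_proba from the earlier cell) gives the full threshold sweep:

proba_pos = y_pred_proba[:, 1] # probability of the positive (anemic) class
fpr_p, tpr_p, _ = metrics.roc_curve(y_test, proba_pos)
precision_p, recall_p, _ = metrics.precision_recall_curve(y_test, proba_pos)
print("AUC from probabilities: {0:0.5f}".format(metrics.roc_auc_score(y_test, proba_pos)))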

# As found in EDA, the response variable Result has unequal class counts.

# Imbalanced data typically refers to classification problems where the classes are not
# represented equally. If one applies classifiers to such a dataset, they are likely to predict
# everything as the majority class. This is often regarded as a problem in learning from highly
# imbalanced datasets.

# To tackle the imbalance, we will be focusing on:

# Random Oversampling
# Random oversampling duplicates examples from the minority class in the training dataset and can
# result in overfitting for some models.

# Random Undersampling
# Random undersampling deletes examples from the majority class and can result in losing
# information invaluable to a model.

# Synthetic Minority Over-sampling Technique (SMOTE)
# In this technique, instead of simply duplicating data from the minority class, we synthesize new
# data from the minority class. This type of data augmentation for tabular data can be very
# effective. This approach to synthesizing new data is called the Synthetic Minority Over-sampling
# Technique, or SMOTE for short. (A minimal sketch of the idea follows this block.)

# Adaptive Synthetic Sampling Method for Imbalanced Data (ADASYN)
# ADASYN (Adaptive Synthetic) is an algorithm that generates synthetic data; its greatest
# advantages are that it does not copy the same minority data and that it generates more data for
# "harder to learn" examples.
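# A minimal sketch of the core SMOTE idea (not the imblearn implementation): a synthetic minority
# sample is an interpolation between an existing minority point and one of its nearest minority
# neighbours. It assumes X_train and y_train from the split above.

import numpy as np
from sklearn.neighbors import NearestNeighbors

minority = X_train[y_train == 1].to_numpy()
nn = NearestNeighbors(n_neighbors=2).fit(minority) # neighbour 0 of a point is the point itself
_, idx = nn.kneighbors(minority[:1])
neighbour = minority[idx[0, 1]]
synthetic = minority[0] + np.random.rand() * (neighbour - minority[0])
print("Synthetic minority sample:", synthetic)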

# Import imbalace technique algorithims

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter # counter takes values returns value_counts dictionary
from sklearn.datasets import make_classification

print('Original dataset shape %s' % Counter(y_train))

# Undersampling only on train


rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_rus))

# Undersampling with Logistic Regression


logreg = LogisticRegression()
logreg.fit(X_train_rus, y_train_rus)

y_pred_rus = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred_rus , y_test)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_rus)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_rus)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_rus)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_rus)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_rus)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_rus)

auc = metrics.roc_auc_score(y_test, y_pred_rus)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Undersampling', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_rus)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_rus)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Undersampling', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix

cnf_matrix = metrics.confusion_matrix(y_test , y_pred_rus)


sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="winter"
,fmt='g')

plt.title('Confusion matrix Random Undersampling', y=1.1, fontsize = 12, fontweight='bold')


plt.xlabel('Predicted',fontsize = 12, fontweight='bold')
plt.ylabel('Actual',fontsize = 12, fontweight='bold')

# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);


# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);

plt.show()

from imblearn.over_sampling import RandomOverSampler

print('Original dataset shape %s' % Counter(y_train))

ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_ros))

# Oversampling with Logistic Regression


logreg = LogisticRegression()
logreg.fit(X_train_ros, y_train_ros)

y_pred_ros = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_ros)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_ros)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_ros)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_ros)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_ros)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_ros)))

An F1 score of 0.94 on the test set with data leakage and a score of 0.94 without data leakage: here, data leakage did not have a significant impact on the model's performance. (A leakage-free cross-validation sketch is shown below.)
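A leakage-free way to evaluate resampling under cross-validation is to place the sampler inside an imblearn pipeline, so it is refit on each training fold only. A minimal sketch using imbalanced_make_pipeline (imported at the top) and RandomOverSampler (imported above):

leak_free = imbalanced_make_pipeline(RandomOverSampler(random_state=42), LogisticRegression(max_iter=1000))
cv_f1 = cross_val_score(leak_free, X, y, cv=5, scoring='f1')
print("Leakage-free CV F1: {:.3f} +/- {:.3f}".format(cv_f1.mean(), cv_f1.std()))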
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_ros)

auc = metrics.roc_auc_score(y_test, y_pred_ros)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Oversampling', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_ros)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_ros)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Oversampling', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix


cnf_matrix = metrics.confusion_matrix(y_test , y_pred_ros)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="summer"
,fmt='g')

plt.title('Confusion matrix Random Oversampling ', y=1.1, fontsize=12, fontweight='bold')


plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)

# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);


# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);

plt.show()

# Logistic Regression with SMOTE data


from imblearn.over_sampling import SMOTE, ADASYN

print('Original dataset shape %s' % Counter(y_train))

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_smote))

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)

y_pred_smote = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_smote)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_smote)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_smote)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_smote)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_smote)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_smote)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_smote)

auc = metrics.roc_auc_score(y_test, y_pred_smote)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12, fontweight='bold')
plt.title('ROC curve for LR SMOTE', fontsize=12, fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_smote)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_smote)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')

# add labels and title


plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR SMOTE', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix


cnf_matrix = metrics.confusion_matrix(y_test , y_pred_smote)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Purples"
,fmt='g')

plt.title('Confusion matrix with SMOTE', y=1.1, fontsize = 12)


plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)

plt.show()

#Logistic Regression with ADASYN data


print('Original dataset shape %s' % Counter(y_train))

adasyn = ADASYN(random_state=42)

X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)


print('Resampled dataset shape %s' % Counter(y_train_adasyn))
# ADASYN Sampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_adasyn, y_train_adasyn)

y_pred_adasyn = logreg.predict(X_test)

print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_adasyn)))


print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_adasyn)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_adasyn)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_adasyn)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_adasyn)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_adasyn)))

# plot ROC Curve


plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_adasyn)

auc = metrics.roc_auc_score(y_test, y_pred_adasyn)


print("AUC - ",auc,"\n")

# plot the ROC curve


plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")

# set the x-axis and y-axis limits


plt.xlim([0, 1])
plt.ylim([0, 1.05])

# add labels and title


plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12,fontweight='bold')
plt.title('ROC curve for LR ADASYN', fontsize=12,fontweight='bold')

# add legend
plt.legend(loc="lower right")

plt.show()

# calculate precision-recall curve


precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_adasyn)

# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_adasyn)
print('f1=%.3f' % (f1))

# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')

# plot the precision-recall curve for the model


plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR ADASYN', fontsize=12, fontweight='bold')

# show the plot


plt.show()

# Heatmap for Confusion Matrix

cnf_matrix = metrics.confusion_matrix(y_test , y_pred_adasyn)


sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Wistia"
,fmt='g')

plt.title('Confusion matrix with LR ADASYN', y=1.1, fontsize = 12,fontweight='bold')


plt.xlabel('Predicted',fontsize = 12,fontweight='bold')
plt.ylabel('Actual',fontsize = 12,fontweight='bold')

plt.show()

**DISTRIBUTION OF BALANCED DATA SET (BUILDING DIFFERENT MODELS)**

names_lst = []

# Empty list to capture performance matrix for train set


aucs_train_lst = []
accuracy_train_lst = []
precision_train_lst = []
recall_train_lst = []
f1_train_lst = []

# Empty list to capture performance matrix for test set


aucs_test_lst = []
accuracy_test_lst = []
precision_test_lst = []
recall_test_lst = []
f1_test_lst = []
kappa_lst = []

# Function for model building and performance measure

def build_measure_model(models):
    plt.figure(figsize=(12,6))

    for name, model, X_train, y_train, X_test, y_test in models:

        names_lst.append(name)

        # Build model
        model.fit(X_train, y_train)
        # Predict
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # calculate accuracy
        Accuracy_train = metrics.accuracy_score(y_train, y_train_pred)
        accuracy_train_lst.append(Accuracy_train)
        Accuracy_test = metrics.accuracy_score(y_test, y_test_pred)
        accuracy_test_lst.append(Accuracy_test)

        # calculate auc
        Aucs_train = metrics.roc_auc_score(y_train, y_train_pred)
        aucs_train_lst.append(Aucs_train)
        Aucs_test = metrics.roc_auc_score(y_test, y_test_pred)
        aucs_test_lst.append(Aucs_test)

        # calculate precision
        PrecisionScore_train = metrics.precision_score(y_train, y_train_pred)
        precision_train_lst.append(PrecisionScore_train)
        PrecisionScore_test = metrics.precision_score(y_test, y_test_pred)
        precision_test_lst.append(PrecisionScore_test)

        # calculate recall
        RecallScore_train = metrics.recall_score(y_train, y_train_pred)
        recall_train_lst.append(RecallScore_train)
        RecallScore_test = metrics.recall_score(y_test, y_test_pred)
        recall_test_lst.append(RecallScore_test)

        # calculate f1 score
        F1Score_train = metrics.f1_score(y_train, y_train_pred)
        f1_train_lst.append(F1Score_train)
        F1Score_test = metrics.f1_score(y_test, y_test_pred)
        f1_test_lst.append(F1Score_test)

        #print('F1 Score of '+ name +' model : {0:0.5f}'.format(F1Score_test))

        # calculate kappa statistic
        kappa = cohen_kappa_score(y_test, y_test_pred)
        kappa_lst.append(kappa)

        # draw confusion matrix
        cnf_matrix = metrics.confusion_matrix(y_test, y_test_pred)

        print("Model Name :", name)
        print('Train Accuracy :{0:0.5f}'.format(Accuracy_train))
        print('Test Accuracy :{0:0.5f}'.format(Accuracy_test))
        print('Train AUC : {0:0.5f}'.format(Aucs_train))
        print('Test AUC : {0:0.5f}'.format(Aucs_test))
        print('Train Precision : {0:0.5f}'.format(PrecisionScore_train))
        print('Test Precision : {0:0.5f}'.format(PrecisionScore_test))
        print('Train Recall : {0:0.5f}'.format(RecallScore_train))
        print('Test Recall : {0:0.5f}'.format(RecallScore_test))
        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))
        print('Kappa Statistic : {0:0.5f}'.format(kappa))
        print('Confusion Matrix : \n', cnf_matrix)
        print("\n")

        # plot ROC Curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred)
        auc = metrics.roc_auc_score(y_test, y_test_pred)
        plt.plot(fpr, tpr, linewidth=2, label=name + ", auc=" + str(auc))

    #---------- For loop ends here --------#

    plt.legend(loc=4)
    plt.plot([0,1], [0,1], 'k--')
    plt.rcParams['font.size'] = 12
    #plt.title('ROC curve for Predicting a anemia cases')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()

**DECISION TREE**

DTmodels = []

dt = DecisionTreeClassifier()

DTmodels.append(('DT imbalance', dt,X_train,y_train,X_test,y_test))


DTmodels.append(('DT Undersampling', dt,X_train_rus,y_train_rus,X_test,y_test))
DTmodels.append(('DT Oversampling', dt,X_train_ros,y_train_ros,X_test,y_test))
DTmodels.append(('DT SMOTE', dt,X_train_smote,y_train_smote,X_test,y_test))
DTmodels.append(('DT ADASYN', dt,X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(DTmodels)

**RANDOM FOREST**

# Random Forest (RF)


RFmodels = []

RFmodels.append(('RF imbalance', RandomForestClassifier(),X_train,y_train,X_test,y_test))


RFmodels.append(('RF Undersampling',
RandomForestClassifier(),X_train_rus,y_train_rus,X_test,y_test))
RFmodels.append(('RF Oversampling',
RandomForestClassifier(),X_train_ros,y_train_ros,X_test,y_test))
RFmodels.append(('RF SMOTE',
RandomForestClassifier(),X_train_smote,y_train_smote,X_test,y_test))
RFmodels.append(('RF ADASYN',
RandomForestClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(RFmodels)

**KNN**

# K-Nearest Neighbors (KNN)


KNNmodels = []

KNNmodels.append(('KNN imbalance', KNeighborsClassifier(),X_train,y_train,X_test,y_test))


KNNmodels.append(('KNN Undersampling',
KNeighborsClassifier(),X_train_rus,y_train_rus,X_test,y_test))
KNNmodels.append(('KNN Oversampling',
KNeighborsClassifier(),X_train_ros,y_train_ros,X_test,y_test))
KNNmodels.append(('KNN SMOTE',
KNeighborsClassifier(),X_train_smote,y_train_smote,X_test,y_test))
KNNmodels.append(('KNN ADASYN',
KNeighborsClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(KNNmodels)

**SVM**

# Support Vector Machines (SVM)


SVMmodels = []

SVMmodels.append(('SVM imbalance', SVC(gamma='auto'),X_train,y_train,X_test,y_test))


SVMmodels.append(('SVM Undersampling',
SVC(gamma='auto'),X_train_rus,y_train_rus,X_test,y_test))
SVMmodels.append(('SVM Oversampling',
SVC(gamma='auto'),X_train_ros,y_train_ros,X_test,y_test))
SVMmodels.append(('SVM SMOTE',
SVC(gamma='auto'),X_train_smote,y_train_smote,X_test,y_test))
SVMmodels.append(('SVM ADASYN',
SVC(gamma='auto'),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(SVMmodels)

# Gaussian Naive Bayes (NB)


NBmodels = []

NBmodels.append(('NB imbalance', GaussianNB(),X_train,y_train,X_test,y_test))


NBmodels.append(('NB Undersampling', GaussianNB(),X_train_rus,y_train_rus,X_test,y_test))
NBmodels.append(('NB Oversampling', GaussianNB(),X_train_ros,y_train_ros,X_test,y_test))
NBmodels.append(('NB SMOTE', GaussianNB(),X_train_smote,y_train_smote,X_test,y_test))
NBmodels.append(('NB ADASYN', GaussianNB(),X_train_adasyn,y_train_adasyn,X_test,y_test))

# Call function to create model and measure its performance


build_measure_model(NBmodels)

from sklearn.ensemble import RandomForestClassifier


from sklearn import metrics

# Random Forest model training


random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Predict from the test set


y_pred = random_forest.predict(X_test)

# Model evaluation
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: {0:0.5f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUC: {0:0.5f}'.format(metrics.roc_auc_score(y_test, y_pred)))
print('Precision: {0:0.5f}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall: {0:0.5f}'.format(metrics.recall_score(y_test, y_pred)))
print('F1: {0:0.5f}'.format(metrics.f1_score(y_test, y_pred)))

**PERFORMANCE MEASURE OF CLASSIFIERS**

data = {'Model':names_lst,
#'Accuracy_Train':accuracy_train_lst,
'Accuracy_Test':accuracy_test_lst,
#'AUC_Train':aucs_train_lst,
'AUC_Test':aucs_test_lst,
#'PrecisionScore_Train':precision_train_lst,
'PrecisionScore_Test':precision_test_lst,
#'RecallScore_Train':recall_train_lst,
'RecallScore_Test':recall_test_lst,
#'F1Score_Train':f1_train_lst,
'F1Score_Test':f1_test_lst,
'Kappa Stat' : kappa_lst
}

print("Performance measures of various classifiers: \n")


performance_df = pd.DataFrame(data)
performance_df = performance_df.round(3)
finaltable = performance_df.sort_values(['F1Score_Test', 'RecallScore_Test', 'AUC_Test'], ascending=False)
finaltable

finaltable.to_excel('my_table.xlsx', index=False)

**HYPERPARAMETER TUNING**

# Use GridSearchCV to find the best parameters.


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# define the classifiers to be evaluated


classifiers_ = [DecisionTreeClassifier(),
RandomForestClassifier(),
SVC(),
GaussianNB(),
LogisticRegression(),
KNeighborsClassifier()]

# define the parameter grids for each classifier


param_grids = [{'max_depth': range(1, 10), 'criterion': ['gini', 'entropy']}, # decision tree
               {'n_estimators': [50, 100, 200], 'max_depth': range(1, 10)}, # random forest
               {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}, # SVM
               {}, # naive bayes - no hyperparameters to tune
               # Naive Bayes is a probabilistic classifier based on Bayes' theorem and the "naive"
               # assumption that each feature is independent of every other feature given the class.
               # Unlike decision trees, SVM, or logistic regression, it exposes no regularization
               # hyperparameters worth searching over here.
               {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}, # logistic regression
               {"n_neighbors": list(range(2, 60, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}] # KNN
# perform GridSearchCV for each classifier
for clf, param_grid in zip(classifiers_, param_grids):
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print(clf.__class__.__name__, "best params:", grid_search.best_params_, "best score:", grid_search.best_score_)
    print("F1 score:", f1_score(y_test, y_pred))

**CROSS VALIDATION**

# perform 5-fold cross-validation for each classifier


for clf in classifiers_:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, "mean accuracy:", scores.mean(), "std deviation:", scores.std())
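StratifiedKFold was imported at the top but never used; it keeps the anemic/non-anemic ratio constant in every fold, which is generally safer for an imbalanced target. A minimal sketch:

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
strat_scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=skf, scoring='f1')
print("Stratified 5-fold F1: {:.3f} +/- {:.3f}".format(strat_scores.mean(), strat_scores.std()))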

from sklearn.tree import export_graphviz


import graphviz

# hyperparameter-tuned decision tree
dt_tuning = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

# Train the decision tree based on selected parameter


dt_tuning.fit(X_train, y_train)
# Generate a DOT file representing the decision tree
dot_data = export_graphviz(dt_tuning, out_file=None, feature_names=X_train.columns,
class_names=['Non Anemic', 'Anemic'], filled=True, rounded=True)

# Visualize the decision path for the new data point


graph = graphviz.Source(dot_data)
graph.render('decision_tree') # save the decision tree as a PDF file
graph

# Testing data

new_data = { 'Gender': 1, 'Hemoglobin': 11, 'MCV':50}


new_X = pd.DataFrame([new_data])
prediction = dt_tuning.predict(new_X) # use the tuned decision tree fitted above
print('Prediction:', prediction)

import matplotlib.pyplot as plt

# Define the labels, scores and colors for each model


labels = ['Decision Tree', 'Random Forest', 'SVM', 'Naive Bayes', 'Logistic Regression', 'KNN']
scores = [1.00, 1.00, 0.994, 0.914, 0.935, 0.988,0.975]
colors = ['#50BFE6','#9C51B6','#FF5470','#0066CC','#FF5050','#E97451']

# Sort the scores and labels in descending order


sorted_scores, sorted_labels = zip(*sorted(zip(scores, labels), reverse=True))

# Set up the plot


fig, ax = plt.subplots(figsize=(12,8))
ax.bar(sorted_labels, sorted_scores, color=colors)

# Set the title and axis labels


ax.set_title('Comparison of Model Performance Grid search', fontsize=12, fontweight='bold')
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy Score', fontsize=12, fontweight='bold')

# Set the tick font size


ax.tick_params(axis='both', which='major', labelsize=12)

# Add the accuracy score as text above each bar


for i, score in enumerate(sorted_scores):
    ax.text(i, score+0.01, f'{score*100:.1f}%', fontsize=12, ha='center')

# Remove spines
sns.despine(left=True, bottom=True)

# Display the plot


plt.show()

**EXPORTING THE MODEL**

import pickle
from sklearn.ensemble import RandomForestClassifier

# Save the Random Forest model as a pickle file


filename = 'random_forest_model.pkl'
pickle.dump(random_forest, open(filename, 'wb'))

print("Random Forest model exported as pickle file:", filename)
