0% found this document useful (0 votes)

20 views7 pages

Implementing KNN Algorithm On The Iris Dataset

Uploaded by

chatborg

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

20 views7 pages

Implementing KNN Algorithm On The Iris Dataset

Uploaded by

chatborg

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 7

John Ndungu / Implementing KNN Algorithm on the Iris Dataset

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets

from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

#EDA on Iris Dataset

We are going to use a very famous dataset called Iris.
Attributes:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
All four features are used below, both in the pair plots and as inputs to the classifier.
Class:
Iris Setosa
Iris Versicolor
Iris Virginica
#Load the Dataset

# import iris dataset

# Fetch the Iris data bundled with scikit-learn.
iris = datasets.load_iris()

# Put the feature matrix and the label vector side by side (labels become the
# last column) and name the columns after the features plus 'target'.
iris_df = pd.DataFrame(
    data=np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)
iris_df.head()

sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target

0 5.1 3.5 1.4 0.2 0.0

1 4.9 3.0 1.4 0.2 0.0

2 4.7 3.2 1.3 0.2 0.0

3 4.6 3.1 1.5 0.2 0.0

4 5.0 3.6 1.4 0.2 0.0

#Describe the Dataset

iris_df.describe()

sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target

count 150.000000 150.000000 150.000000 150.000000 150.000000

mean 5.843333 3.057333 3.758000 1.199333 1.000000

std 0.828066 0.435866 1.765298 0.762238 0.819232

min 4.300000 2.000000 1.000000 0.100000 0.000000

25% 5.100000 2.800000 1.600000 0.300000 0.000000

50% 5.800000 3.000000 4.350000 1.300000 1.000000

75% 6.400000 3.300000 5.100000 1.800000 2.000000

max 7.900000 4.400000 6.900000 2.500000 2.000000

#Split into X and Y

# Features: every column except the trailing 'target' label column.
x = iris_df.drop(columns='target')
# Labels: the 'target' column on its own.
y = iris_df['target']

x.head()

y.head()

0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
Name: target, dtype: float64

#Split into training and testing

# split the data into train and test sets

# Hold out 20% of the samples for testing. The data is shuffled to avoid bias
# and the seed is fixed so the split is reproducible. Each returned piece is
# converted to a plain NumPy array in the same step.
x_train, x_test, y_train, y_test = (
    np.asarray(part)
    for part in train_test_split(x, y, test_size=0.2, shuffle=True, random_state=0)
)

print(f'training set size: {x_train.shape[0]} samples \ntest set size: {x_test.shape[0]} samples')

training set size: 120 samples

test set size: 30 samples

#Normalize the Dataset

# Normalizer rescales each sample (row) rather than each feature column; the
# rows printed after normalization have unit length.
scaler = Normalizer().fit(x_train)

# Run the training and the test split through the same transformer.
normalized_x_train, normalized_x_test = (
    scaler.transform(split) for split in (x_train, x_test)
)

# Show the first five training rows before and after the transformation.
print('x train before Normalization')
print(x_train[0:5])
print('\nx train after Normalization')
print(normalized_x_train[0:5])

x train before Normalization

[[6.4 3.1 5.5 1.8]
[5.4 3. 4.5 1.5]
[5.2 3.5 1.5 0.2]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.2]]

x train after Normalization

[[0.69804799 0.338117 0.59988499 0.196326 ]
[0.69333409 0.38518561 0.57777841 0.1925928 ]
[0.80641965 0.54278246 0.23262105 0.03101614]
[0.71171214 0.35002236 0.57170319 0.21001342]
[0.69417747 0.30370264 0.60740528 0.2386235 ]]

#Visualize the Dataset before and after Normalization

# Map the numeric class codes to species names so the legends are readable.
di = {0.0: 'Setosa', 1.0: 'Versicolor', 2.0: 'Virginica'}

## Before
# Pairwise relationships between the raw features of the full dataset,
# colour coded by species.
before = sns.pairplot(iris_df.replace({'target': di}), hue='target')
before.fig.suptitle('Pair Plot of the dataset Before normalization', y=1.08)

## After
# Same plot for the normalized training split only: rebuild a frame from the
# normalized features with their labels appended as the last column.
iris_df_2 = pd.DataFrame(
    data=np.column_stack((normalized_x_train, y_train)),
    columns=[*iris['feature_names'], 'target'],
)
after = sns.pairplot(iris_df_2.replace({'target': di}), hue='target')
after.fig.suptitle('Pair Plot of the dataset After normalization', y=1.08)

Text(0.5, 1.08, 'Pair Plot of the dataset After normalization')

#KNN Step 1 (Euclidean Distance)

def distance_ecu(x_train, x_test_point):
    """
    Compute the Euclidean distance from one test point to every training point.

    Input:
        - x_train: corresponding to the training data; a 2-D array-like with
          one row per training sample and one column per feature.
        - x_test_point: corresponding to the test point; a 1-D array-like with
          as many entries as x_train has columns.

    Output:
        - distances: a DataFrame with a single column 'dist'. Row i holds the
          distance between the test point and x_train[i], so the default
          integer index is the position of the training sample.

    Raises:
        ValueError: if x_train is not 2-D or the test point's number of
        features does not match the number of columns of x_train.
    """
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    # No training samples: there is nothing to measure, return an empty frame.
    if train.size == 0:
        return pd.DataFrame(data=[], columns=['dist'])

    # Fail loudly on a feature-count mismatch instead of comparing only a
    # prefix of the features.
    if train.ndim != 2 or point.shape != (train.shape[1],):
        raise ValueError(
            f'x_test_point with shape {point.shape} does not match '
            f'x_train with shape {train.shape}'
        )

    # Broadcasting subtracts the test point from every training row at once;
    # summing the squared differences along each row and taking the square
    # root gives one Euclidean distance per training sample.
    squared_diff = (train - point) ** 2
    distances = np.sqrt(squared_diff.sum(axis=1))

    # Store distances in a dataframe
    return pd.DataFrame(data=distances, columns=['dist'])

#KNN Step 2 (Find the nearest neighbors)

def nearest_neighbors(distance_point, K):
    """
    Select the K training points closest to the test point.

    Input:
        - distance_point: DataFrame with a 'dist' column holding the distances
          between the test point and each point in the training data.
        - K: the number of neighbors.

    Output:
        - df_nearest: the K rows of distance_point with the smallest 'dist',
          ordered from nearest to farthest. The original index is kept so the
          rows can be mapped back to the training samples.
    """
    # A stable sort keeps equally distant points in their original row order,
    # so which neighbors make the cut on a tie is deterministic.
    df_sorted = distance_point.sort_values(by='dist', axis=0, kind='mergesort')

    ## Take only the first K neighbors
    return df_sorted.iloc[:K]

#KNN Step 3 (Classify the point based on a majority vote)

def voting(df_nearest, y_train):
    """
    Classify a test point by majority vote among its nearest neighbors.

    Input:
        - df_nearest: DataFrame containing the nearest K neighbors of the test
          point; its index holds the positions of those neighbors in the
          training dataset.
        - y_train: the labels of the training dataset (array, list or Series).

    Output:
        - y_pred: the most frequent label among the neighbors. When several
          labels are equally frequent, the one encountered first in
          df_nearest wins.

    Raises:
        IndexError: if df_nearest contains no rows.
    """
    # Work on a plain array so the neighbor index is always used positionally,
    # whatever container (and whatever pandas index) the labels arrived in.
    labels = np.asarray(y_train)
    neighbor_labels = labels[df_nearest.index.to_numpy()]

    ## Use the Counter Object to count the labels of the K nearest neighbors.
    counter_vote = Counter(neighbor_labels)

    y_pred = counter_vote.most_common(1)[0][0]  # Majority Voting
    return y_pred

#KNN Full Algorithm: Putting Everything Together

def KNN_from_scratch(x_train, y_train, x_test, K):
    """
    Predict a label for every test point with K-nearest-neighbors.

    Input:
        - x_train: the full training dataset
        - y_train: the labels of the training dataset
        - x_test: the full test dataset, one row per test point
        - K: the number of neighbors

    Output:
        - y_pred: a list with one prediction per row of x_test, in order,
          each obtained by majority voting among the K nearest neighbors.
    """
    # Convert once so the loop walks over rows; iterating a DataFrame directly
    # would yield its column labels instead of its samples.
    test_points = np.asarray(x_test)

    y_pred = []

    ## Loop over all the test set and perform the three steps
    for x_test_point in test_points:
        distance_point = distance_ecu(x_train, x_test_point)      ## Step 1
        df_nearest_point = nearest_neighbors(distance_point, K)   ## Step 2
        y_pred_point = voting(df_nearest_point, y_train)          ## Step 3
        y_pred.append(y_pred_point)

    return y_pred

#Test the KNN Algorithm on the test dataset

# Number of neighbors; the same K is passed to the scikit-learn classifier
# used for comparison.
K=3
# Predict a label for every normalized test sample with the from-scratch KNN.
y_pred_scratch= KNN_from_scratch(normalized_x_train, y_train, normalized_x_test, K)
print(y_pred_scratch)

[2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0]
#Compare our implementation with Sklearn library

# Reference run: scikit-learn's classifier with the same K, trained and
# evaluated on the same normalized data as the from-scratch implementation.
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(normalized_x_train, y_train)
y_pred_sklearn = knn.predict(normalized_x_test)
print(y_pred_sklearn)

[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 1. 0. 1. 2. 0. 0. 2. 1. 0. 0.
2. 0. 0. 1. 1. 0.]

#Check if the output is exactly the same

# True only if both prediction sequences have the same shape and identical
# elements, i.e. the two implementations agree on every test sample.
print(np.array_equal(y_pred_sklearn, y_pred_scratch))

True

#Calculate the accuracy of both methods

# Fraction of test samples whose predicted label matches y_test, reported
# for the from-scratch implementation...
print(f'The accuracy of our implementation is {accuracy_score(y_test, y_pred_scratch)}')

# ...and for the scikit-learn implementation.
print(f'The accuracy of sklearn implementation is {accuracy_score(y_test, y_pred_sklearn)}')

The accuracy of our implementation is 0.9666666666666667

The accuracy of sklearn implementation is 0.9666666666666667

#Perform Hyper-parameter Tuning using K-fold Cross Validation

n_splits = 4  ## Choose the number of splits

kf = KFold(n_splits=n_splits)  ## Build the K-fold splitter

accuracy_k = []  ## Keep track of the mean validation accuracy for each K

k_values = list(range(1, 30, 2))  ## Search for the best value of K among the odd values 1..29

for k in k_values:  ## Loop over the K values

    accuracy_fold = 0  ## Running sum of the per-fold accuracies for this K

    ## Loop over the splits; each split yields training and validation row indices
    for normalized_x_train_fold_idx, normalized_x_valid_fold_idx in kf.split(normalized_x_train):
        ## Fetch the training part of the fold
        normalized_x_train_fold = normalized_x_train[normalized_x_train_fold_idx]
        y_train_fold = y_train[normalized_x_train_fold_idx]

        ## Fetch the validation part of the fold
        normalized_x_test_fold = normalized_x_train[normalized_x_valid_fold_idx]
        y_valid_fold = y_train[normalized_x_valid_fold_idx]

        y_pred_fold = KNN_from_scratch(normalized_x_train_fold, y_train_fold, normalized_x_test_fold, k)

        ## Accumulate the accuracy (true labels first, predictions second)
        accuracy_fold += accuracy_score(y_valid_fold, y_pred_fold)

    accuracy_fold = accuracy_fold / n_splits  ## Divide by the number of splits
    accuracy_k.append(accuracy_fold)

## Pair each mean accuracy with the K value that produced it
print(f'The accuracy for each K value was {list ( zip (accuracy_k, k_values))}')

The accuracy for each K value was [(0.9666666666666668, 1), (0.9666666666666668, 3), (0.9666666666666668, 5), (0.9666666666666668, 7), (0.958

# np.argmax returns the position of the first maximum. k_values is ascending,
# so when several K values tie for the best accuracy the smallest is reported.
print(f'Best accuracy was {np.max(accuracy_k)}, which corresponds to a value of K= {k_values[np.argmax(accuracy_k)]}')

Best accuracy was 0.9666666666666668, which corresponds to a value of K= 1

CRTP Command Checklists
No ratings yet
CRTP Command Checklists
6 pages
Orchestral Tools - The Orchestral Grands Manual
No ratings yet
Orchestral Tools - The Orchestral Grands Manual
12 pages
LAB-4 Report
No ratings yet
LAB-4 Report
21 pages
Assignment #1: K Nearest Neighbor Classifier: Name: Srikanth Mujjiga (Roll No: 2015-50-831
No ratings yet
Assignment #1: K Nearest Neighbor Classifier: Name: Srikanth Mujjiga (Roll No: 2015-50-831
8 pages
Rahul Raj - Ipynb - Colab
No ratings yet
Rahul Raj - Ipynb - Colab
50 pages
Machine Learning Lab
No ratings yet
Machine Learning Lab
33 pages
ML Experiment WithDataset
No ratings yet
ML Experiment WithDataset
23 pages
V
No ratings yet
V
8 pages
DM ML Practical
No ratings yet
DM ML Practical
13 pages
Assignment 4
No ratings yet
Assignment 4
9 pages
Mlalllabprgs
No ratings yet
Mlalllabprgs
17 pages
Aam Codes
No ratings yet
Aam Codes
8 pages
ML Lab Manual
No ratings yet
ML Lab Manual
24 pages
Wa0003
No ratings yet
Wa0003
16 pages
Assignment No 2 AI
No ratings yet
Assignment No 2 AI
4 pages
Exercise Final
No ratings yet
Exercise Final
8 pages
ML Journal External
No ratings yet
ML Journal External
14 pages
MLLab Manual
No ratings yet
MLLab Manual
24 pages
ML Lab
No ratings yet
ML Lab
7 pages
ML Programs
No ratings yet
ML Programs
14 pages
Lab Manual
No ratings yet
Lab Manual
9 pages
16BCB0126 VL2018195002535 Pe003
No ratings yet
16BCB0126 VL2018195002535 Pe003
40 pages
Unit2 ML Programs
No ratings yet
Unit2 ML Programs
7 pages
K-Nearest Neighbor: General Gist
No ratings yet
K-Nearest Neighbor: General Gist
14 pages
Lab4 KNN
No ratings yet
Lab4 KNN
9 pages
Practical - 5 - 52
No ratings yet
Practical - 5 - 52
4 pages
EX - NO:3: Algorithm
No ratings yet
EX - NO:3: Algorithm
11 pages
Mlda - Lab
No ratings yet
Mlda - Lab
35 pages
DL Exp-1.4 19BCS1431
No ratings yet
DL Exp-1.4 19BCS1431
5 pages
KNN - Predictive Analysis
No ratings yet
KNN - Predictive Analysis
6 pages
Worksheet - 2.3 20BCS7490
No ratings yet
Worksheet - 2.3 20BCS7490
6 pages
Program 4
No ratings yet
Program 4
3 pages
Lab Session 9
No ratings yet
Lab Session 9
2 pages
K Nearest Neighbors
No ratings yet
K Nearest Neighbors
5 pages
Python For Data Science IA 1 Programs
No ratings yet
Python For Data Science IA 1 Programs
14 pages
ML 5
No ratings yet
ML 5
2 pages
Machine Learning Lab Manual
No ratings yet
Machine Learning Lab Manual
9 pages
Worksheet - 2.3 20BCS7611
No ratings yet
Worksheet - 2.3 20BCS7611
6 pages
Program 4
No ratings yet
Program 4
3 pages
4K-Nearest Neighbor
No ratings yet
4K-Nearest Neighbor
38 pages
K-Nearest Neighbor On Python Ken Ocuma
100% (2)
K-Nearest Neighbor On Python Ken Ocuma
9 pages
ML Short Code - Under Updating
No ratings yet
ML Short Code - Under Updating
4 pages
Part A 3. KNN Classification
No ratings yet
Part A 3. KNN Classification
35 pages
Python For Data Science IA 1 Programs
No ratings yet
Python For Data Science IA 1 Programs
14 pages
B-56 Sanket Jambhulkar MLA-7
No ratings yet
B-56 Sanket Jambhulkar MLA-7
9 pages
Machine Learning LAB
No ratings yet
Machine Learning LAB
20 pages
Lecture 12 K-Nearest Neighbors
No ratings yet
Lecture 12 K-Nearest Neighbors
24 pages
Big Data Assignment - 7
No ratings yet
Big Data Assignment - 7
7 pages
Machine Learning Programs
No ratings yet
Machine Learning Programs
10 pages
Lab 8
No ratings yet
Lab 8
7 pages
Week10 KNN Practical
No ratings yet
Week10 KNN Practical
4 pages
Dhanashree ML Report
No ratings yet
Dhanashree ML Report
3 pages
SVM K NN MLP With Sklearn Jupyter NoteBo
No ratings yet
SVM K NN MLP With Sklearn Jupyter NoteBo
22 pages
DSASSign 4
No ratings yet
DSASSign 4
11 pages
Lab 10 - Manual and Assignment On KNN
No ratings yet
Lab 10 - Manual and Assignment On KNN
3 pages
DSM 3
No ratings yet
DSM 3
6 pages
K-Means Clustering From Scratch
No ratings yet
K-Means Clustering From Scratch
3 pages
CP4252 Lab Manual
No ratings yet
CP4252 Lab Manual
13 pages
2 - 9 - KNN Code
No ratings yet
2 - 9 - KNN Code
6 pages
Programs Lab Bca
No ratings yet
Programs Lab Bca
16 pages
Activity 01: Python Set/s of Source Code Use in The Activity (Paste Below)
No ratings yet
Activity 01: Python Set/s of Source Code Use in The Activity (Paste Below)
2 pages
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
AppDev Slide 14 Analytical Apps Developemnt
No ratings yet
AppDev Slide 14 Analytical Apps Developemnt
21 pages
B Tech in Cse With SPL in Internet of Things
No ratings yet
B Tech in Cse With SPL in Internet of Things
4 pages
Lenberg #5 (Security Deficiencies) (051621)
No ratings yet
Lenberg #5 (Security Deficiencies) (051621)
15 pages
Sony VPL-HW40ES SXRD Projector Specs
No ratings yet
Sony VPL-HW40ES SXRD Projector Specs
1 page
P Series Temperature Control System Accessories Catalog - V1.0 - 20221108
No ratings yet
P Series Temperature Control System Accessories Catalog - V1.0 - 20221108
8 pages
PF - 1
No ratings yet
PF - 1
27 pages
Low-Code No-Code Development Reference
No ratings yet
Low-Code No-Code Development Reference
13 pages
Project Report
No ratings yet
Project Report
22 pages
Professional Data Engineer Certification - Learn - Google Cloud
No ratings yet
Professional Data Engineer Certification - Learn - Google Cloud
5 pages
Digital Locker
No ratings yet
Digital Locker
12 pages
Gpma0390 Manual v1 - 1
No ratings yet
Gpma0390 Manual v1 - 1
62 pages
2581A-NEC Photo Coupler
100% (1)
2581A-NEC Photo Coupler
13 pages
Careers at Hyland - Jobs in Software at Hyland - Developer 2 in Kolkata, West Bengal - Careers at Kolkata
No ratings yet
Careers at Hyland - Jobs in Software at Hyland - Developer 2 in Kolkata, West Bengal - Careers at Kolkata
5 pages
SA-02A4 SA-02M Software Instruction Manual
No ratings yet
SA-02A4 SA-02M Software Instruction Manual
130 pages
Ipv6 1 PDF
No ratings yet
Ipv6 1 PDF
35 pages
2.1 Common Network Ports
No ratings yet
2.1 Common Network Ports
2 pages
Module 1 - Intro To GenAI - PEC - Gen - AI - Training
No ratings yet
Module 1 - Intro To GenAI - PEC - Gen - AI - Training
49 pages
Process Log
No ratings yet
Process Log
42 pages
Yama Blox Fruits Wiki Fandom
No ratings yet
Yama Blox Fruits Wiki Fandom
1 page
Module 1
No ratings yet
Module 1
4 pages
DAA Theory Notes
No ratings yet
DAA Theory Notes
31 pages
Quarter 1 Week 4 Mathematics 10: NAME: - YR & SEC: - Competency
No ratings yet
Quarter 1 Week 4 Mathematics 10: NAME: - YR & SEC: - Competency
9 pages
Arensa Prasta: Technical Proficiencies
No ratings yet
Arensa Prasta: Technical Proficiencies
2 pages
Lab1 - Dasari Nikhil Reddy DBMS
No ratings yet
Lab1 - Dasari Nikhil Reddy DBMS
6 pages
SPARC S7 Architecture Assessment
100% (1)
SPARC S7 Architecture Assessment
10 pages
A Developer'S Migration Plan: I D E M P I E R E
No ratings yet
A Developer'S Migration Plan: I D E M P I E R E
23 pages
6.003 Homework #12 Solutions: Problems
No ratings yet
6.003 Homework #12 Solutions: Problems
9 pages
PEP 8 - The Style Guide For Python Code
No ratings yet
PEP 8 - The Style Guide For Python Code
48 pages

Implementing KNN Algorithm On The Iris Dataset

Uploaded by

Implementing KNN Algorithm On The Iris Dataset

Uploaded by

John Ndungu / Implementing KNN Algorithm on the Iris Dataset

import matplotlib.pyplot as plt

from sklearn import datasets

from collections import Counter

#EDA on Iris Dataset

# import iris dataset

0 5.1 3.5 1.4 0.2 0.0

1 4.9 3.0 1.4 0.2 0.0

2 4.7 3.2 1.3 0.2 0.0

3 4.6 3.1 1.5 0.2 0.0

4 5.0 3.6 1.4 0.2 0.0

#Describe the Dataset

count 150.000000 150.000000 150.000000 150.000000 150.000000

mean 5.843333 3.057333 3.758000 1.199333 1.000000

std 0.828066 0.435866 1.765298 0.762238 0.819232

min 4.300000 2.000000 1.000000 0.100000 0.000000

25% 5.100000 2.800000 1.600000 0.300000 0.000000

50% 5.800000 3.000000 4.350000 1.300000 1.000000

75% 6.400000 3.300000 5.100000 1.800000 2.000000

max 7.900000 4.400000 6.900000 2.500000 2.000000

#Split into X and Y

#Split into training and testing

# split the data into train and test sets

training set size: 120 samples

#Normalize the Dataset

scaler= Normalizer().fit(x_train) # the scaler is fitted to the training set

x train before Normalization

x train after Normalization

#Visualize the Dataset before and after Normalization

before= sns.pairplot(iris_df.replace({'target': di}), hue= 'target')

Text(0.5, 1.08, 'Pair Plot of the dataset After normalization')

def distance_ecu(x_train, x_test_point):

for col in range(len(current_train_point)): ## Loop over the columns of the row

current_distance += (current_train_point[col] - x_test_point[col]) **2

distances.append(current_distance) ## Append the distances

# Store distances in a dataframe

#KNN Step 2 (Find the nearest neighbors)

# Sort values using the sort_values function

## Take only the first K neighbors

#KNN Step 3 (Classify the point based on a majority vote)

def voting(df_nearest, y_train):

y_pred= counter_vote.most_common()[0][0] # Majority Voting

#KNN Full Algorithm: Putting Everything Together

def KNN_from_scratch(x_train, y_train, x_test, K):

#Test the KNN Algorithm on the test dataset

#Check if the output is exactly the same

#Calculate the accuracy of both methods

print(f'The accuracy of our implementation is {accuracy_score(y_test, y_pred_scratch)}')

The accuracy of our implementation is 0.9666666666666667

#Perform Hyper-parameter Tuning using K-fold Cross Validation

n_splits= 4 ## Choose the number of splits

accuracy_k= [] ## Keep track of the accuracy for each K

for k in k_values: ## Loop over the K values

accuracy_fold+= accuracy_score (y_pred_fold, y_valid_fold) ## Accumulate the accuracy

print(f'Best accuracy was {np.max(accuracy_k)}, which corresponds to a value of K= {k_values[np.argmax(accuracy_k)]}')

Best accuracy was 0.9666666666666668, which corresponds to a value of K= 1

You might also like