Machine Learning [3170724]: All Practicals
Practical 1
Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the
training data from a .CSV file.
Data set:
Sky AirTemp Humidity Wind Water Forecast EnjoySport
1 Sunny Warm Normal Strong Warm Same Yes
2 Sunny Warm High Strong Warm Same Yes
3 Rainy Cold High Strong Warm Change No
4 Sunny Warm High Strong Cool Change Yes
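The listing below reads this table from a plain CSV file. A plausible layout for pr1.csv (an assumption; the exact file is not shown) is the table above with a header row:

Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes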
Code:
import pandas as pd
import numpy as np

data = pd.read_csv("pr1.csv")
print("Given Dataset:")
print(data)

d = np.array(data)[:, :-1]
print("\nThe attributes are:\n", d)
target = np.array(data)[:, -1]
print("\nThe target is:\n", target)

def train(c, t):
    # Start from the first positive example as the most specific hypothesis
    for i, val in enumerate(t):
        if val == "Yes":
            specific_hypothesis = c[i].copy()
            break
    # Generalize just enough to cover every positive example
    for i, val in enumerate(c):
        if t[i] == "Yes":
            for x in range(len(specific_hypothesis)):
                if val[x] != specific_hypothesis[x]:
                    specific_hypothesis[x] = '?'
    return specific_hypothesis

print("\nThe final hypothesis is:\n", train(d, target))
Output:
Practical 2
For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of
the set of all hypotheses consistent with the training examples.
Data set:
Sky AirTemp Humidity Wind Water Forecast EnjoySport
1 Sunny Warm Normal Strong Warm Same Yes
2 Sunny Warm High Strong Warm Same Yes
3 Rainy Cold High Strong Warm Change No
4 Sunny Warm High Strong Cool Change Yes
Code:
import numpy as np
import pandas as pd

data = pd.read_csv('pr2.csv')
print("Given Dataset:")
print(data)

concepts = np.array(data.iloc[:, 0:-1])
print("\nThe attributes are:")
print(concepts)
target = np.array(data.iloc[:, -1])
print("\nThe target is:")
print(target)

def learn(concepts, target):
    # S starts at the first training example; G starts fully general
    specific_h = concepts[0].copy()
    print("\nInitialization of specific_h and general_h")
    print(specific_h)
    general_h = [["?" for _ in range(len(specific_h))] for _ in range(len(specific_h))]
    print(general_h)
    for i, h in enumerate(concepts):
        if target[i] == "Yes":
            # Positive example: generalize S where it disagrees
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        if target[i] == "No":
            # Negative example: specialize G against the current S
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
        print("\nSteps of Candidate Elimination Algorithm", i + 1)
        print(specific_h)
        print(general_h)
    # Drop hypotheses in G that remained fully general
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("\nFinal Specific_h:", s_final, sep="\n")
print("\nFinal General_h:", g_final, sep="\n")
Output:
Practical 3
Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and
apply this knowledge to classify a new sample.
Data set:
Day Outlook Temperature Humidity Wind PlayTennis
D1 Sunny Hot High Weak No
D2 Sunny Hot High Strong No
D3 Overcast Hot High Weak Yes
D4 Rain Mild High Weak Yes
D5 Rain Cool Normal Weak Yes
D6 Rain Cool Normal Strong No
D7 Overcast Cool Normal Strong Yes
D8 Sunny Mild High Weak No
D9 Sunny Cool Normal Weak Yes
D10 Rain Mild Normal Weak Yes
D11 Sunny Mild Normal Strong Yes
D12 Overcast Mild High Strong Yes
D13 Overcast Hot Normal Weak Yes
D14 Rain Mild High Strong No
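As a hand check on the implementation, the root-level entropy and information gains for this dataset (the standard PlayTennis figures) are:

Entropy(S) = -(9/14) log2(9/14) - (5/14) log2(5/14) ≈ 0.940
Gain(S, Outlook)     ≈ 0.246
Gain(S, Humidity)    ≈ 0.151
Gain(S, Wind)        ≈ 0.048
Gain(S, Temperature) ≈ 0.029

so Outlook should be selected at the root.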
Code:
import math
import copy
import pandas as pd

dataset = pd.read_csv('pr3.csv')
X = dataset.iloc[:, 1:].values  # drop the Day column; last column is the label
attribute = ['outlook', 'temp', 'humidity', 'wind']

class Node(object):
    def __init__(self):
        self.value = None      # attribute name (internal node) or 'Yes'/'No' (leaf)
        self.decision = None   # attribute value on the incoming branch
        self.childs = None

def findEntropy(data, rows):
    # Entropy of the class labels over the given subset of rows
    yes = no = 0
    ans = -1
    idx = len(data[0]) - 1
    entropy = 0
    for i in rows:
        if data[i][idx] == 'Yes':
            yes += 1
        else:
            no += 1
    x = yes / (yes + no)
    y = no / (yes + no)
    if x != 0 and y != 0:
        entropy = -1 * (x * math.log2(x) + y * math.log2(y))
    if x == 1:
        ans = 1
    if y == 1:
        ans = 0
    return entropy, ans

def findMaxGain(data, rows, columns):
    # Choose the attribute with the highest information gain
    maxGain = 0
    retidx = -1
    entropy, ans = findEntropy(data, rows)
    if entropy == 0:
        return maxGain, retidx, ans
    for j in columns:
        mydict = {}
        for i in rows:
            key = data[i][j]
            mydict[key] = mydict.get(key, 0) + 1
        gain = entropy
        for key in mydict:
            yes = no = 0
            for k in rows:
                if data[k][j] == key:
                    if data[k][-1] == 'Yes':
                        yes += 1
                    else:
                        no += 1
            x = yes / (yes + no)
            y = no / (yes + no)
            if x != 0 and y != 0:
                # Weight each branch by its share of the current subset
                gain += (mydict[key] * (x * math.log2(x) + y * math.log2(y))) / len(rows)
        if gain > maxGain:
            maxGain = gain
            retidx = j
    return maxGain, retidx, ans

def buildTree(data, rows, columns):
    maxGain, idx, ans = findMaxGain(data, rows, columns)
    root = Node()
    root.childs = []
    if maxGain == 0:
        # Pure subset: make a leaf
        root.value = 'Yes' if ans == 1 else 'No'
        return root
    root.value = attribute[idx]
    mydict = {}
    for i in rows:
        key = data[i][idx]
        mydict[key] = mydict.get(key, 0) + 1
    newcolumns = copy.deepcopy(columns)
    newcolumns.remove(idx)
    for key in mydict:
        newrows = [i for i in rows if data[i][idx] == key]
        temp = buildTree(data, newrows, newcolumns)
        temp.decision = key
        root.childs.append(temp)
    return root

def traverse(root):
    print(root.decision)
    print(root.value)
    for child in root.childs:
        traverse(child)

def calculate():
    rows = list(range(len(X)))
    columns = list(range(len(attribute)))
    root = buildTree(X, rows, columns)
    root.decision = 'Start'
    traverse(root)

calculate()
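For this dataset the tree comes out as the textbook result: Outlook at the root, Overcast → Yes, Sunny splitting on Humidity (High: No, Normal: Yes), and Rain splitting on Wind (Weak: Yes, Strong: No). The listing prints the tree but never classifies a new sample, as the task asks; a minimal, hypothetical helper (not part of the original program) that walks the tree built above:

def classify(root, sample):
    # sample maps attribute name to value, e.g. {'outlook': 'Sunny', ...}
    if not root.childs:                 # leaf: value holds 'Yes'/'No'
        return root.value
    for child in root.childs:
        if child.decision == sample[root.value]:
            return classify(child, sample)
    return None                         # attribute value unseen in training

root = buildTree(X, list(range(len(X))), list(range(len(attribute))))
print(classify(root, {'outlook': 'Sunny', 'temp': 'Cool',
                      'humidity': 'High', 'wind': 'Strong'}))  # expected: No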
Output:
Practical 4
Build an Artificial Neural Network by implementing the Backpropagation
algorithm and test the same using appropriate data sets.
Data set:
X1 X2 Y
2.7810836 2.550537003 0
1.465489372 2.362125076 0
3.396561688 4.400293529 0
1.38807019 1.850220317 0
3.06407232 3.005305973 0
7.627531214 2.759262235 1
5.332441248 2.088626775 1
6.922596716 1.77106367 1
8.675418651 -0.242068655 1
7.673756466 3.508563011 1
Code:
from math import exp
from random import seed
from random import random

# Build a network: one hidden layer, random weights, bias stored as the last weight
def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    return network

# Weighted sum of inputs plus bias
def activate(weights, inputs):
    activation = weights[-1]
    for i in range(len(weights) - 1):
        activation += weights[i] * inputs[i]
    return activation

# Sigmoid transfer function
def transfer(activation):
    return 1.0 / (1.0 + exp(-activation))

def forward_propagate(network, row):
    inputs = row
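    # --- The printed listing breaks off here at a page boundary. What follows
    # is a hedged reconstruction of the missing remainder (the standard
    # backpropagation recipe: finish the forward pass, back-propagate deltas
    # with the sigmoid derivative, update weights online, train on the dataset
    # above), not the original author's exact code.
    for layer in network:
        new_inputs = []
        for neuron in layer:
            activation = activate(neuron['weights'], inputs)
            neuron['output'] = transfer(activation)
            new_inputs.append(neuron['output'])
        inputs = new_inputs
    return inputs

# Derivative of the sigmoid, written in terms of its output
def transfer_derivative(output):
    return output * (1.0 - output)

# Propagate the error backwards and store a delta on every neuron
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = list()
        if i != len(network) - 1:
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += neuron['weights'][j] * neuron['delta']
                errors.append(error)
        else:
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])

# Online gradient step for every weight (bias is the last weight)
def update_weights(network, row, l_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += l_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += l_rate * neuron['delta']

# Train for a fixed number of epochs, printing the summed squared error
def train_network(network, train, l_rate, n_epoch, n_outputs):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            expected = [0 for i in range(n_outputs)]
            expected[row[-1]] = 1
            sum_error += sum([(expected[i] - outputs[i]) ** 2 for i in range(len(expected))])
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))

# The dataset from the table above, hard-coded so the program runs end to end
seed(1)
dataset = [[2.7810836, 2.550537003, 0],
           [1.465489372, 2.362125076, 0],
           [3.396561688, 4.400293529, 0],
           [1.38807019, 1.850220317, 0],
           [3.06407232, 3.005305973, 0],
           [7.627531214, 2.759262235, 1],
           [5.332441248, 2.088626775, 1],
           [6.922596716, 1.77106367, 1],
           [8.675418651, -0.242068655, 1],
           [7.673756466, 3.508563011, 1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set(row[-1] for row in dataset))
network = initialize_network(n_inputs, 2, n_outputs)
train_network(network, dataset, 0.5, 20, n_outputs)
for layer in network:
    print(layer)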
Output:
Practical 5
Write a program to implement the naïve Bayesian classifier for a sample
training data set stored as a .CSV file. Compute the accuracy of the classifier,
considering a few test data sets.
Data set:
Day Outlook Temperature Humidity Wind PlayTennis
D1 Sunny Hot High False No
D2 Sunny Hot High True No
D3 Overcast Hot High False Yes
D4 Rainy Mild High False Yes
D5 Rainy Cool Normal False Yes
D6 Rainy Cool Normal True No
D7 Overcast Cool Normal True Yes
D8 Sunny Mild High False No
D9 Sunny Cool Normal False Yes
D10 Rainy Mild Normal False Yes
D11 Sunny Mild Normal True Yes
D12 Overcast Mild High True Yes
D13 Overcast Hot Normal False Yes
D14 Rainy Mild High True No
Code:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv('pr5.csv')
X = data.iloc[:, :-1].copy()
if 'Day' in X.columns:          # drop the example index if the CSV carries it
    X = X.drop(columns=['Day'])
y = data.iloc[:, -1]

# Label-encode every categorical attribute column
for col in X.columns:
    X[col] = LabelEncoder().fit_transform(X[col])
print("\nNow the train data is:\n", X)

le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
print("\nNow the train output is:\n", y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

classifier = GaussianNB()
classifier.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
print("Accuracy is:", accuracy_score(y_test, classifier.predict(X_test)))
Output:
Practical 6
Assuming a set of documents that need to be classified, use the naïve Bayesian
Classifier model to perform this task. Built-in Java classes/API can be used to
write the program. Calculate the accuracy, precision, and recall for your data
set.
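All three requested metrics follow from the confusion-matrix counts (TP, FP, TN, FN):

Accuracy  = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall    = TP / (TP + FN)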
Data set:
Text Documents Label
1 I love this sandwich pos
2 This is an amazing place pos
3 I feel very good about these beers pos
4 This is my best work pos
5 What an awesome view pos
6 I do not like this restaurant neg
7 I am tired of this stuff neg
8 I can’t deal with this neg
9 He is my sworn enemy neg
10 My boss is horrible neg
11 This is an awesome place pos
12 I do not like the taste of this juice neg
13 I love to dance pos
14 I am sick and tired of this place neg
15 What a great holiday pos
16 That is a bad locality to stay neg
17 We will have good fun tomorrow pos
18 I went to my enemy’s house today neg
Code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

msg = pd.read_csv('pr6.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum

xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print('\nThe total number of Training Data:', ytrain.shape)
print('\nThe total number of Test Data:', ytest.shape)

# Bag-of-words features: fit the vocabulary on the training split only
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)

clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

print('\nAccuracy metrics:')
print('Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\nConfusion matrix:')
print(metrics.confusion_matrix(ytest, predicted))
print("\nRecall:", metrics.recall_score(ytest, predicted))
print("\nPrecision:", metrics.precision_score(ytest, predicted))
Output:
Practical 7
Write a program to construct a Bayesian network considering medical data.
Use this model to demonstrate the diagnosis of heart patients using standard
Heart Disease Data Set. You can use Java/Python ML library classes/API.
Data set:
Age Gender Family Diet Lifestyle Cholestrol HeartDisease
1 0 0 1 1 3 0 1
2 0 1 1 1 3 0 1
3 1 0 0 0 2 1 1
4 4 0 1 1 3 2 0
5 3 1 1 0 0 2 0
6 2 0 1 1 1 0 1
7 4 0 1 0 2 0 1
8 0 0 1 1 3 0 1
9 3 1 1 0 0 2 0
10 1 1 0 0 0 2 1
11 4 1 0 1 2 0 1
12 4 0 1 1 3 2 0
13 2 1 0 0 0 0 0
14 2 0 1 1 1 0 1
15 3 1 1 0 0 1 0
16 0 0 1 0 0 2 1
17 1 1 0 1 2 1 1
18 3 1 1 1 0 1 0
19 4 0 1 1 3 2 0
Code:
import pandas as pd
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel  # renamed BayesianNetwork in newer pgmpy releases
from pgmpy.inference import VariableElimination

heart_disease = pd.read_csv("pr7.csv")

# Network structure; node names must match the CSV column headers exactly
model = BayesianModel([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('diet', 'cholestrol'),
    ('Lifestyle', 'diet'),
    ('cholestrol', 'heartdisease')
])
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)

HeartDisease_infer = VariableElimination(model)

print('For Age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4')
print('For Gender enter Male:0, Female:1')
print('For Family History enter Yes:1, No:0')
print('For Diet enter High:0, Medium:1')
print('For LifeStyle enter Athlete:0, Active:1, Moderate:2, Sedentary:3')
print('For Cholesterol enter High:0, BorderLine:1, Normal:2')

q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
    'age': int(input('Enter Age: ')),
    'Gender': int(input('Enter Gender: ')),
    'Family': int(input('Enter Family History: ')),
    'diet': int(input('Enter Diet: ')),
    'Lifestyle': int(input('Enter Lifestyle: ')),
    'cholestrol': int(input('Enter Cholestrol: '))
})
print(q)
Output:
Practical 8
Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same
data set for clustering using k-Means algorithm. Compare the results of these
two algorithms and comment on the quality of clustering. You can use
Java/Python ML library classes/API in the program.
Data set:
Iris Dataset
Code:
import csv
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as sm
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

class_dict = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
with open('pr8.csv') as csvFile:
    dataset = [line for line in csv.reader(csvFile)]
dataset = dataset[1:]  # drop the header row

X = []
y = []
for line in dataset:
    X.append(line[:-1])
    y.append(class_dict[line[-1]])
X = np.array(X).astype(float)
y = np.array(y).astype(int)

def rename_clusters(s):
    # Map each cluster id to the majority true class so labels are comparable
    cnt = Counter((c1, c2) for c1, c2 in zip(s, y))
    map_dict = {}
    for tup in cnt.most_common():
        if tup[0][0] not in map_dict:
            map_dict[tup[0][0]] = tup[0][1]
    for i in range(len(s)):
        s[i] = map_dict[s[i]]
    return s

# EM part (Gaussian mixture fitted by expectation-maximization)
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
em = rename_clusters(gmm.predict(X))
plt.scatter(X[:, 0], X[:, 1], c=em, s=40, cmap='viridis')
plt.show()
print("Accuracy EM : ", sm.accuracy_score(y, em))

# K-means part
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
km = rename_clusters(kmeans.predict(X))
plt.scatter(X[:, 0], X[:, 1], c=km, s=40, cmap='viridis')
plt.show()
print("Accuracy KM : ", sm.accuracy_score(y, km))
Output:
Practical 9
Write a program to implement k-Nearest Neighbor algorithm to classify the
iris data set. Print both correct and wrong predictions. Java/Python ML
library classes can be used for this problem.
Data set:
Iris Dataset
Code:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

class_dict = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
with open('pr9.csv') as csvFile:
    dataset = [line for line in csv.reader(csvFile)]
dataset = dataset[1:]  # drop the header row

X = []
y = []
for line in dataset:
    X.append(line[:-1])
    y.append(class_dict[line[-1]])
X = np.array(X).astype(float)
y = np.array(y).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# k = 5 neighbours, Euclidean distance (Minkowski with p = 2)
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(class_dict)
print("X", "y_actual", "y_predicted", "is_correct")
for _x, _ya, _yp in zip(X_test, y_test, y_pred):
    print(_x, _ya, _yp, _ya == _yp)
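A confusion matrix summarises the same correct/wrong information per class in one table (a supplement, not in the original listing):

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))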
Output:
Practical 10
Implement the non-parametric Locally Weighted Regression algorithm in
order to fit data points. Select appropriate data set for your experiment and
draw graphs.
Data set:
Dynamic Dataset
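Locally weighted regression fits a separate weighted least-squares line at every query point x0, which is exactly what the code below implements:

w_i(x0)  = exp(-||x_i - x0||^2 / (2 tau^2))     (radial_kernel)
beta(x0) = (X^T W X)^+ X^T W Y                  (the pinv solve)
prediction at x0 = [1, x0] . beta(x0)

Here tau is the kernel bandwidth: a small tau tracks the data closely, while a large tau approaches an ordinary global linear fit, as the four plots with tau = 10, 1, 0.1, 0.01 show.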
Code:
import numpy as np
import matplotlib.pyplot as plt

def radial_kernel(x0, X, tau):
    return np.exp(np.sum((X - x0) ** 2, axis=1) / (-2 * tau * tau))

def local_regression(x0, X, Y, tau):
    x0 = np.r_[1, x0]
    X = np.c_[np.ones(len(X)), X]
    xw = X.T * radial_kernel(x0, X, tau)
    beta = np.linalg.pinv(xw @ X) @ xw @ Y
    return x0 @ beta

n = 1000
# Generate dataset
X = np.linspace(-3, 3, num=n)
print("The Data Set (10 Samples) X:\n", X[1:10])
Y = np.log(np.abs(X ** 2 - 1) + .5)
print("The Fitting Curve Data Set (10 Samples) Y:\n", Y[1:10])
# Jitter X
X += np.random.normal(scale=.1, size=n)
print("Jitter (10 Samples) X:\n", X[1:10])
domain = np.linspace(-3, 3, num=300)
print("Xo Domain Space (10 Samples):\n", domain[1:10])

def plot_lwr(tau):
    predictions = [local_regression(x0, X, Y, tau) for x0 in domain]
    plt.scatter(X, Y, color='blue', alpha=0.3, s=20)
    plt.plot(domain, predictions, color='red', linewidth=3)
    plt.show()

plot_lwr(10.)
plot_lwr(1.)
plot_lwr(0.1)
plot_lwr(0.01)
Output: