Experiment-1: Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the training data
from a .CSV file.
import csv

num_attributes = 6
a = []

print("\n The Given Training Data Set \n")
with open('ML1&2.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)

print("\n The initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# Initialize the hypothesis with the first training instance
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

print("\n Find S: Finding a Maximally Specific Hypothesis\n")
for i in range(0, len(a)):
    if a[i][num_attributes] == 'yes':  # generalize only on positive examples
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
        print(" For Training instance No:{0} the hypothesis is".format(i), hypothesis)

print("\n The Maximally Specific Hypothesis for the given Training Examples :\n")
print(hypothesis)
The Given Training Data Set
['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'yes']
['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'yes']
['rainy', 'cold', 'high', 'strong', 'warm', 'change', 'no']
['sunny', 'warm', 'high', 'strong', 'cool', 'change', 'yes']
The initial value of hypothesis:
['0', '0', '0', '0', '0', '0']
Find S: Finding a Maximally Specific Hypothesis
For Training instance No:0 the hypothesis is ['sunny', 'warm', 'normal', 'strong', 'warm', 'same']
For Training instance No:1 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
For Training instance No:3 the hypothesis is ['sunny', 'warm', '?', 'strong', '?', '?']
The Maximally Specific Hypothesis for the given Training Examples :
['sunny', 'warm', '?', 'strong', '?', '?']
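The learned hypothesis can then classify an unseen instance: it is predicted positive only when every non-'?' attribute matches. A minimal sketch (the matches helper and the sample instance are illustrative, not part of the original program):

def matches(hypothesis, instance):
    # positive iff every constrained attribute agrees with the instance
    return all(h == '?' or h == v for h, v in zip(hypothesis, instance))

sample = ['sunny', 'warm', 'high', 'strong', 'cool', 'change']  # hypothetical test instance
print('yes' if matches(hypothesis, sample) else 'no')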
Experiment-2: For a given set of training data examples stored in a .CSV file, implement
and demonstrate the Candidate-Elimination algorithm to output a description of the set
of all hypotheses consistent with the training examples.
import numpy as np
import pandas as pd

data = pd.DataFrame(data=pd.read_csv('ML1&2.csv'))
concepts = np.array(data.iloc[:, 0:-1])
print(concepts)
target = np.array(data.iloc[:, -1])
print(target)

def learn(concepts, target):
    specific_h = concepts[0].copy()
    print("Initialization of specific_h and general_h")
    print(specific_h)
    general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
    print(general_h)
    for i, h in enumerate(concepts):
        if target[i] == "yes":  # positive example: generalize specific_h
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
            print(specific_h)
            print(general_h)
        if target[i] == "no":  # negative example: specialize general_h
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
            print("Steps of Candidate Elimination Algorithm", i + 1)
            print(specific_h)
            print(general_h)
    # drop fully unconstrained rows from the general boundary
    indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
    for i in indices:
        general_h.remove(['?', '?', '?', '?', '?', '?'])
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("Final Specific_h:", s_final, sep="\n")
print("Final General_h:", g_final, sep="\n")
[['sunny' 'warm' 'high' 'strong' 'warm' 'same']
 ['rainy' 'cold' 'high' 'strong' 'warm' 'change']
 ['sunny' 'warm' 'high' 'strong' 'cool' 'change']]
['yes' 'no' 'yes']
Initialization of specific_h and general_h
['sunny' 'warm' 'high' 'strong' 'warm' 'same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
['sunny' 'warm' 'high' 'strong' 'warm' 'same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Steps of Candidate Elimination Algorithm 2
['sunny' 'warm' 'high' 'strong' 'warm' 'same']
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', 'same']]
['sunny' 'warm' 'high' 'strong' '?' '?']
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]
Final Specific_h:
['sunny' 'warm' 'high' 'strong' '?' '?']
Final General_h:
[['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?']]
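The hypotheses between the final specific and general boundaries form the version space, and a new instance can be voted on by the boundary hypotheses. A small sketch (the consistent helper and the sample instance are illustrative assumptions):

def consistent(hypothesis, instance):
    # an instance satisfies a hypothesis when every non-'?' attribute matches
    return all(h == '?' or h == v for h, v in zip(hypothesis, instance))

sample = ['sunny', 'warm', 'normal', 'strong', 'cool', 'change']  # hypothetical instance
votes = [consistent(h, sample) for h in [list(s_final)] + g_final]
print("positive votes:", sum(votes), "of", len(votes))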
Experiment-3: Write a program to demonstrate the working of the decision tree based
ID3 algorithm. Use an appropriate data set for building the decision tree and apply this
knowledge to classify a new sample.
import numpy as np
import math
import csv

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
    return (metadata, traindata)

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros((items.shape[0], 1), dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    for x in range(items.shape[0]):
        # extract the scalar count to avoid the NumPy >= 1.25 deprecation warning
        dict[items[x]] = np.empty((int(count[x, 0]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
    for count in counts.ravel():  # iterate over scalars, not 1-element arrays
        sums += -1 * count * math.log(count, 2)
    return sums

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size * 1.0)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log(ratio, 2)
    total_entropy = entropy(data[:, -1])
    iv = -1 * sum(intrinsic)
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy / iv

def create_node(data, metadata):
    # pure node: all examples share one class, so return a leaf
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    s = ""
    for x in range(size):
        s += " "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

metadata, traindata = read_data("ML3.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
 Outlook
  Overcast
   b'Yes'
  Rainy
   Windy
    b'False'
     b'Yes'
    b'True'
     b'No'
  Sunny
   Humidity
    b'High'
     b'No'
    b'Normal'
     b'Yes'
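The experiment also asks to apply the tree to a new sample. A minimal sketch that walks the tree built above (the classify helper and the sample values are illustrative assumptions; note that attribute values are stored as |S32 bytes, and unseen values would raise a KeyError):

def classify(node, sample, metadata):
    # follow branches matching the sample's attribute values until a leaf is reached
    while node.answer == "":
        value = sample[metadata.index(node.attribute)].encode()  # values stored as bytes
        node = dict(node.children)[value]
    return node.answer

# hypothetical unseen instance ordered like the CSV header (class column omitted);
# assumes columns Outlook, Temperature, Humidity, Windy as in the play-tennis data
sample = ['Sunny', 'Cool', 'Normal', 'False']
print(classify(node, sample, metadata))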
Experiment-4: Exercises to solve real-world problems using the following machine
learning methods:
a) Linear Regression
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('ML4.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()

plt.scatter(X_test, y_test, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
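Beyond the plots, the fit can be quantified on the held-out set. A short sketch using standard sklearn metrics (variable names follow the code above):

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("R^2 :", r2_score(y_test, y_pred))            # fraction of variance explained
print("MAE :", mean_absolute_error(y_test, y_pred)) # average absolute error
print("MSE :", mean_squared_error(y_test, y_pred))  # average squared error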
Experiment-4: Exercises to solve real-world problems using the following machine
learning methods:
b) Logistic Regression
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('ML4B.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[65 3]
[ 8 24]]
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# Create a list of colors
colors = ['red', 'green']
cmap = ListedColormap(colors)
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # use the color keyword to avoid matplotlib's single-RGBA UserWarning
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color=cmap(i), label=j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# Create a list of colors
colors = ['red', 'green']
cmap = ListedColormap(colors)
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # use the color keyword to avoid matplotlib's single-RGBA UserWarning
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color=cmap(i), label=j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
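From the confusion matrix above, the overall accuracy follows directly. A brief sketch (variable names as above):

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))  # (65 + 24) / 100 = 0.89
print(classification_report(y_test, y_pred))        # per-class precision/recall/F1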
Experiment-4: Exercises to solve real-world problems using the following machine
learning methods:
c) Binary Classifier
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer(as_frame=True)
dataset['data'].head()
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840
1        20.57         17.77          132.90     1326.0          0.08474
2        19.69         21.25          130.00     1203.0          0.10960
3        11.42         20.38           77.58      386.1          0.14250
4        20.29         14.34          135.10     1297.0          0.10030

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419
1           0.07864          0.0869              0.07017         0.1812
2           0.15990          0.1974              0.12790         0.2069
3           0.28390          0.2414              0.10520         0.2597
4           0.13280          0.1980              0.10430         0.1809

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0                 0.07871  ...         25.38          17.33           184.60
1                 0.05667  ...         24.99          23.41           158.80
2                 0.05999  ...         23.57          25.53           152.50
3                 0.09744  ...         14.91          26.50            98.87
4                 0.05883  ...         22.54          16.67           152.20

   worst area  worst smoothness  worst compactness  worst concavity  \
0      2019.0            0.1622             0.6656           0.7119
1      1956.0            0.1238             0.1866           0.2416
2      1709.0            0.1444             0.4245           0.4504
3       567.7            0.2098             0.8663           0.6869
4      1575.0            0.1374             0.2050           0.4000

   worst concave points  worst symmetry  worst fractal dimension
0                0.2654          0.4601                  0.11890
1                0.1860          0.2750                  0.08902
2                0.2430          0.3613                  0.08758
3                0.2575          0.6638                  0.17300
4                0.1625          0.2364                  0.07678

[5 rows x 30 columns]
dataset['target'].head()
0 0
1 0
2 0
3 0
4 0
Name: target, dtype: int32
dataset['target'].value_counts()
target
1 357
0 212
Name: count, dtype: int64
X = dataset['data']
y = dataset['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# scale the test set with the statistics learned on the training set
X_test = ss.transform(X_test)
models = {}

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

from sklearn.metrics import accuracy_score, precision_score, recall_score
accuracy, precision, recall = {}, {}, {}
for key in models.keys():
    # Fit the classifier
    models[key].fit(X_train, y_train)
    # Make predictions
    predictions = models[key].predict(X_test)
    # Calculate metrics (sklearn expects y_true first, then y_pred)
    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()
print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)
print(recall)
True Positive(TP)  =  89
False Positive(FP) =  6
True Negative(TN)  =  47
False Negative(FN) =  1
{'K-Nearest Neighbor': 0.9888888888888889}
accuracy = (TP + TN) / (TP + FP + TN + FN)
print('Accuracy of the binary classifier = {:0.3f}'.format(accuracy))
precision = TP / (TP + FP)
print('Precision of the binary classifier = {:0.3f}'.format(precision))
recall = TP / (TP + FN)
print('Recall of the binary classifier = {:0.3f}'.format(recall))
F1_Score = (2 * TP) / (2 * TP + FP + FN)
print('F1_Score of the binary classifier = {:0.3f}'.format(F1_Score))
Accuracy of the binary classifier = 0.951
Precision of the binary classifier = 0.937
Recall of the binary classifier = 0.989
F1_Score of the binary classifier = 0.962
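These hand-computed values can be cross-checked against sklearn in one call. A short sketch (names follow the code above; in this dataset class 0 is malignant and class 1 is benign):

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names=['malignant', 'benign']))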
Experiment-5: Develop a program for Bias, Variance, Removing Duplicates, and Cross
Validation.
For bias & variance:
# Load dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_iris

# Load iris dataset
X, y = load_iris(return_X_y=True)

# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# Model definition
model_lr = LinearRegression()

# Fitting training set into model
model_lr.fit(X_train, y_train)

# Predicting using Linear Regression Model
Prediction = model_lr.predict(X_test)

# Evaluating variance
Variance = np.var(Prediction)
print("Variance:", Variance)
Variance: 0.6317289156999706

# Evaluating SSE
SSE = np.mean((np.mean(Prediction) - y) ** 2)
SSE
0.666933139470129

# Evaluating bias as the remainder of SSE after variance
Bias = SSE - Variance
Bias
0.035204223770158416
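The decomposition above is a single-split estimate; a more robust estimate averages predictions over repeated resamples. A small illustrative sketch (the bootstrap loop and the number of rounds are assumptions, not part of the original program):

# estimate bias^2 and variance of the model over bootstrap resamples (illustrative)
rng = np.random.default_rng(0)
preds = []
for _ in range(100):  # hypothetical number of bootstrap rounds
    idx = rng.integers(0, len(X_train), len(X_train))
    preds.append(LinearRegression().fit(X_train[idx], y_train[idx]).predict(X_test))
preds = np.array(preds)
mean_pred = preds.mean(axis=0)
print("Bias^2  :", np.mean((mean_pred - y_test) ** 2))
print("Variance:", np.mean(preds.var(axis=0)))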
For removing duplicate values:
import pandas as pd
df = pd.read_csv('ML5.csv')
df.head(10)
  symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0         3                 ?  alfa-romero       gas        std          two
1         3                 ?  alfa-romero       gas        std          two
2         1                 ?  alfa-romero       gas        std          two
3         2               164         audi       gas        std         four
4         2               164         audi       gas        std         four
5         2                 ?         audi       gas        std          two
6         1               158         audi       gas        std         four
7         1                 ?         audi       gas        std         four
8         1               158         audi       gas      turbo         four
9         0                 ?         audi       gas      turbo          two

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130
1  convertible          rwd           front        88.6  ...          130
2    hatchback          rwd           front        94.5  ...          152
3        sedan          fwd           front        99.8  ...          109
4        sedan          4wd           front        99.4  ...          136
5        sedan          fwd           front        99.8  ...          136
6        sedan          fwd           front       105.8  ...          136
7        wagon          fwd           front       105.8  ...          136
8        sedan          fwd           front       105.8  ...          131
9    hatchback          4wd           front        99.5  ...          131

  fuel-system  bore stroke compression-ratio horsepower peak-rpm  city-mpg  \
0        mpfi  3.47   2.68               9.0        111     5000        21
1        mpfi  3.47   2.68               9.0        111     5000        21
2        mpfi  2.68   3.47               9.0        154     5000        19
3        mpfi  3.19    3.4              10.0        102     5500        24
4        mpfi  3.19    3.4               8.0        115     5500        18
5        mpfi  3.19    3.4               8.5        110     5500        19
6        mpfi  3.19    3.4               8.5        110     5500        19
7        mpfi  3.19    3.4               8.5        110     5500        19
8        mpfi  3.13    3.4               8.3        140     5500        17
9        mpfi  3.13    3.4               7.0        160     5500        16

   highway-mpg  price
0           27  13495
1           27  16500
2           26  16500
3           30  13950
4           22  17450
5           25  15250
6           25  17710
7           25  18920
8           20  23875
9           22      ?

[10 rows x 26 columns]
df.drop_duplicates()
    symboling normalized-losses         make fuel-type aspiration  \
0           3                 ?  alfa-romero       gas        std
1           3                 ?  alfa-romero       gas        std
2           1                 ?  alfa-romero       gas        std
3           2               164         audi       gas        std
4           2               164         audi       gas        std
..        ...               ...          ...       ...        ...
200        -1                95        volvo       gas        std
201        -1                95        volvo       gas      turbo
202        -1                95        volvo       gas        std
203        -1                95        volvo    diesel      turbo
204        -1                95        volvo       gas      turbo

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0            two  convertible          rwd           front        88.6  ...
1            two  convertible          rwd           front        88.6  ...
2            two    hatchback          rwd           front        94.5  ...
3           four        sedan          fwd           front        99.8  ...
4           four        sedan          4wd           front        99.4  ...
..           ...          ...          ...             ...         ...  ...
200         four        sedan          rwd           front       109.1  ...
201         four        sedan          rwd           front       109.1  ...
202         four        sedan          rwd           front       109.1  ...
203         four        sedan          rwd           front       109.1  ...
204         four        sedan          rwd           front       109.1  ...

    engine-size fuel-system  bore stroke compression-ratio horsepower  \
0           130        mpfi  3.47   2.68               9.0        111
1           130        mpfi  3.47   2.68               9.0        111
2           152        mpfi  2.68   3.47               9.0        154
3           109        mpfi  3.19    3.4              10.0        102
4           136        mpfi  3.19    3.4               8.0        115
..          ...         ...   ...    ...               ...        ...
200         141        mpfi  3.78   3.15               9.5        114
201         141        mpfi  3.78   3.15               8.7        160
202         173        mpfi  3.58   2.87               8.8        134
203         145         idi  3.01    3.4              23.0        106
204         141        mpfi  3.78   3.15               9.5        114

    peak-rpm  city-mpg  highway-mpg  price
0       5000        21           27  13495
1       5000        21           27  16500
2       5000        19           26  16500
3       5500        24           30  13950
4       5500        18           22  17450
..       ...       ...          ...    ...
200     5400        23           28  16845
201     5300        19           25  19045
202     5500        18           23  21485
203     4800        26           27  22470
204     5400        19           25  22625

[205 rows x 26 columns]
df.drop_duplicates(subset=['make'])
     symboling normalized-losses           make fuel-type aspiration  \
0            3                 ?    alfa-romero       gas        std
3            2               164           audi       gas        std
10           2               192            bmw       gas        std
18           2               121      chevrolet       gas        std
21           1               118          dodge       gas        std
30           2               137          honda       gas        std
43           0                 ?          isuzu       gas        std
47           0               145         jaguar       gas        std
50           1               104          mazda       gas        std
67          -1                93  mercedes-benz    diesel      turbo
75           1                 ?        mercury       gas      turbo
76           2               161     mitsubishi       gas        std
89           1               128         nissan       gas        std
107          0               161         peugot       gas        std
118          1               119       plymouth       gas        std
125          3               186        porsche       gas        std
130          0                 ?        renault       gas        std
132          3               150           saab       gas        std
138          2                83         subaru       gas        std
150          1                87         toyota       gas        std
182          2               122     volkswagen    diesel        std
194         -2               103          volvo       gas        std

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0            two  convertible          rwd           front        88.6  ...
3           four        sedan          fwd           front        99.8  ...
10           two        sedan          rwd           front       101.2  ...
18           two    hatchback          fwd           front        88.4  ...
21           two    hatchback          fwd           front        93.7  ...
30           two    hatchback          fwd           front        86.6  ...
43          four        sedan          rwd           front        94.3  ...
47          four        sedan          rwd           front       113.0  ...
50           two    hatchback          fwd           front        93.1  ...
67          four        sedan          rwd           front       110.0  ...
75           two    hatchback          rwd           front       102.7  ...
76           two    hatchback          fwd           front        93.7  ...
89           two        sedan          fwd           front        94.5  ...
107         four        sedan          rwd           front       107.9  ...
118          two    hatchback          fwd           front        93.7  ...
125          two    hatchback          rwd           front        94.5  ...
130         four        wagon          fwd           front        96.1  ...
132          two    hatchback          fwd           front        99.1  ...
138          two    hatchback          fwd           front        93.7  ...
150          two    hatchback          fwd           front        95.7  ...
182          two        sedan          fwd           front        97.3  ...
194         four        sedan          rwd           front       104.3  ...

    engine-size fuel-system  bore stroke compression-ratio horsepower  \
0           130        mpfi  3.47   2.68              9.00        111
3           109        mpfi  3.19    3.4             10.00        102
10          108        mpfi   3.5    2.8              8.80        101
18           61        2bbl  2.91   3.03              9.50         48
21           90        2bbl  2.97   3.23              9.41         68
30           92        1bbl  2.91   3.41              9.60         58
43          111        2bbl  3.31   3.23              8.50         78
47          258        mpfi  3.63   4.17              8.10        176
50           91        2bbl  3.03   3.15              9.00         68
67          183         idi  3.58   3.64             21.50        123
75          140        mpfi  3.78   3.12              8.00        175
76           92        2bbl  2.97   3.23              9.40         68
89           97        2bbl  3.15   3.29              9.40         69
107         120        mpfi  3.46   3.19              8.40         97
118          90        2bbl  2.97   3.23              9.40         68
125         151        mpfi  3.94   3.11              9.50        143
130         132        mpfi  3.46    3.9              8.70          ?
132         121        mpfi  3.54   3.07              9.31        110
138          97        2bbl  3.62   2.36              9.00         69
150          92        2bbl  3.05   3.03              9.00         62
182          97         idi  3.01    3.4             23.00         52
194         141        mpfi  3.78   3.15              9.50        114

    peak-rpm  city-mpg  highway-mpg  price
0       5000        21           27  13495
3       5500        24           30  13950
10      5800        23           29  16430
18      5100        47           53   5151
21      5500        37           41   5572
30      4800        49           54   6479
43      4800        24           29   6785
47      4750        15           19  32250
50      5000        30           31   5195
67      4350        22           25  25552
75      5000        19           24  16503
76      5500        37           41   5389
89      5200        31           37   5499
107     5000        19           24  11900
118     5500        37           41   5572
125     5500        19           27  22018
130        ?        23           31   9295
132     5250        21           28  11850
138     4900        31           36   5118
150     4800        35           39   5348
182     4800        37           46   7775
194     5400        23           28  12940

[22 rows x 26 columns]
df.drop_duplicates(subset=['fuel-type','body-style'])
     symboling normalized-losses           make fuel-type aspiration  \
0            3                 ?    alfa-romero       gas        std
2            1                 ?    alfa-romero       gas        std
3            2               164           audi       gas        std
7            1                 ?           audi       gas        std
63           0                 ?          mazda    diesel        std
68          -1                93  mercedes-benz    diesel      turbo
69           0                93  mercedes-benz    diesel      turbo
74           1                 ?  mercedes-benz       gas        std
159          0                91         toyota    diesel        std

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0            two  convertible          rwd           front        88.6  ...
2            two    hatchback          rwd           front        94.5  ...
3           four        sedan          fwd           front        99.8  ...
7           four        wagon          fwd           front       105.8  ...
63             ?        sedan          fwd           front        98.8  ...
68          four        wagon          rwd           front       110.0  ...
69           two      hardtop          rwd           front       106.7  ...
74           two      hardtop          rwd           front       112.0  ...
159         four    hatchback          fwd           front        95.7  ...

    engine-size fuel-system  bore stroke compression-ratio horsepower  \
0           130        mpfi  3.47   2.68               9.0        111
2           152        mpfi  2.68   3.47               9.0        154
3           109        mpfi  3.19    3.4              10.0        102
7           136        mpfi  3.19    3.4               8.5        110
63          122         idi  3.39   3.39              22.7         64
68          183         idi  3.58   3.64              21.5        123
69          183         idi  3.58   3.64              21.5        123
74          304        mpfi   3.8   3.35               8.0        184
159         110         idi  3.27   3.35              22.5         56

    peak-rpm  city-mpg  highway-mpg  price
0       5000        21           27  13495
2       5000        19           26  16500
3       5500        24           30  13950
7       5500        19           25  18920
63      4650        36           42  10795
68      4350        22           25  28248
69      4350        22           25  28176
74      4500        14           16  45400
159     4500        38           47   7788

[9 rows x 26 columns]
df.drop_duplicates(subset=['fuel-type','body-style'],keep='last')
     symboling normalized-losses           make fuel-type aspiration  \
69           0                93  mercedes-benz    diesel      turbo
114          0                 ?         peugot    diesel      turbo
159          0                91         toyota    diesel        std
170          2               134         toyota       gas        std
189          3                 ?     volkswagen       gas        std
190          3               256     volkswagen       gas        std
199         -1                74          volvo       gas      turbo
203         -1                95          volvo    diesel      turbo
204         -1                95          volvo       gas      turbo

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
69           two      hardtop          rwd           front       106.7  ...
114         four        wagon          rwd           front       114.2  ...
159         four    hatchback          fwd           front        95.7  ...
170          two      hardtop          rwd           front        98.4  ...
189          two  convertible          fwd           front        94.5  ...
190          two    hatchback          fwd           front        94.5  ...
199         four        wagon          rwd           front       104.3  ...
203         four        sedan          rwd           front       109.1  ...
204         four        sedan          rwd           front       109.1  ...

    engine-size fuel-system  bore stroke compression-ratio horsepower  \
69          183         idi  3.58   3.64              21.5        123
114         152         idi   3.7   3.52              21.0         95
159         110         idi  3.27   3.35              22.5         56
170         146        mpfi  3.62    3.5               9.3        116
189         109        mpfi  3.19    3.4               8.5         90
190         109        mpfi  3.19    3.4               8.5         90
199         130        mpfi  3.62   3.15               7.5        162
203         145         idi  3.01    3.4              23.0        106
204         141        mpfi  3.78   3.15               9.5        114

    peak-rpm  city-mpg  highway-mpg  price
69      4350        22           25  28176
114     4150        25           25  17075
159     4500        38           47   7788
170     4800        24           30  11199
189     5500        24           29  11595
190     5500        24           29   9980
199     5100        17           22  18950
203     4800        26           27  22470
204     5400        19           25  22625

[9 rows x 26 columns]
For cross-validation:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
X, y = datasets.load_iris(return_X_y=True)
clf = DecisionTreeClassifier(random_state=42)
k_folds = KFold(n_splits = 5)
scores = cross_val_score(clf, X, y, cv = k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))
Cross Validation Scores:  [1.         1.         0.83333333 0.93333333 0.8       ]
Average CV Score: 0.9133333333333333
Number of CV Scores used in Average: 5
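KFold without shuffling can place whole iris classes into a single fold; a stratified split usually gives steadier scores. A brief sketch reusing the data and classifier above (the shuffle and random_state settings are illustrative):

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # preserves class ratios per fold
print("Stratified CV Scores: ", cross_val_score(clf, X, y, cv=skf))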
Experiment-6: Write a program to implement Categorical Encoding, One-hot Encoding
import pandas as pd
df = pd.read_csv('homeprices.csv')
df
{"table":{"data":{"area":
[2600,3000,3200,3600,4000,2600,2800,3300,3600,2600,2900,3100,3600],"
index":[0,1,2,3,4,5,6,7,8,9,10,11,12],"price":
[550000,565000,610000,680000,725000,585000,615000,650000,710000,5750
00,600000,620000,695000],"town":["monroe township","monroe
township","monroe township","monroe township","monroe
township","west windsor","west windsor","west windsor","west
windsor","robinsville","robinsville","robinsville","robinsville"]},"
schema":{"fields":[{"name":"index","type":"integer"},
{"name":"town","type":"string"},{"name":"area","type":"integer"},
{"name":"price","type":"integer"}],"pandas_version":"1.4.0","primary
Key":["index"]}},"total_rows":13,"truncation_type":null}
dummies = pd.get_dummies(df.town)
dummies
merged = pd.concat([df, dummies], axis='columns')
merged
{"table":{"data":{"area":
[2600,3000,3200,3600,4000,2600,2800,3300,3600,2600,2900,3100,3600],"
index":[0,1,2,3,4,5,6,7,8,9,10,11,12],"monroe township":
[1,1,1,1,1,0,0,0,0,0,0,0,0],"price":
[550000,565000,610000,680000,725000,585000,615000,650000,710000,5750
00,600000,620000,695000],"robinsville":
[0,0,0,0,0,0,0,0,0,1,1,1,1],"town":["monroe township","monroe
township","monroe township","monroe township","monroe
township","west windsor","west windsor","west windsor","west
windsor","robinsville","robinsville","robinsville","robinsville"],"w
est windsor":[0,0,0,0,0,1,1,1,1,0,0,0,0]},"schema":{"fields":
[{"name":"index","type":"integer"},{"name":"town","type":"string"},
{"name":"area","type":"integer"},{"name":"price","type":"integer"},
{"name":"monroe township","type":"integer"},
{"name":"robinsville","type":"integer"},{"name":"west
windsor","type":"integer"}],"pandas_version":"1.4.0","primaryKey":
["index"]}},"total_rows":13,"truncation_type":null}
final = merged.drop(['town', 'west windsor'], axis='columns')
final
{"table":{"data":{"area":
[2600,3000,3200,3600,4000,2600,2800,3300,3600,2600,2900,3100,3600],"
index":[0,1,2,3,4,5,6,7,8,9,10,11,12],"monroe township":
[1,1,1,1,1,0,0,0,0,0,0,0,0],"price":
[550000,565000,610000,680000,725000,585000,615000,650000,710000,5750
00,600000,620000,695000],"robinsville":
[0,0,0,0,0,0,0,0,0,1,1,1,1]},"schema":{"fields":
[{"name":"index","type":"integer"},{"name":"area","type":"integer"},
{"name":"price","type":"integer"},{"name":"monroe
township","type":"integer"},
{"name":"robinsville","type":"integer"}],"pandas_version":"1.4.0","p
rimaryKey":["index"]}},"total_rows":13,"truncation_type":null}
from sklearn.linear_model import LinearRegression
model = LinearRegression()
x = final.drop(['price'], axis='columns')
x
{"table":{"data":{"area":
[2600,3000,3200,3600,4000,2600,2800,3300,3600,2600,2900,3100,3600],"
index":[0,1,2,3,4,5,6,7,8,9,10,11,12],"monroe township":
[1,1,1,1,1,0,0,0,0,0,0,0,0],"robinsville":
[0,0,0,0,0,0,0,0,0,1,1,1,1]},"schema":{"fields":
[{"name":"index","type":"integer"},{"name":"area","type":"integer"},
{"name":"monroe township","type":"integer"},
{"name":"robinsville","type":"integer"}],"pandas_version":"1.4.0","p
rimaryKey":["index"]}},"total_rows":13,"truncation_type":null}
y=final.price
y
0 550000
1 565000
2 610000
3 680000
4 725000
5 585000
6 615000
7 650000
8 710000
9 575000
10 600000
11 620000
12 695000
Name: price, dtype: int64
model.fit(x,y)
LinearRegression()
model.predict([[28,0,1]])
array([239015.93205768])
model.predict([[3400,0,0]])
array([681241.66845839])
model.score(x,y)
0.9573929037221872
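The same encoding can be produced with sklearn instead of pd.get_dummies. A short sketch (note a version-dependent assumption: sklearn >= 1.2 uses sparse_output=, older releases use sparse=):

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first', sparse_output=False)  # drop one column to avoid the dummy-variable trap
town_encoded = ohe.fit_transform(df[['town']])
print(ohe.get_feature_names_out(['town']))
print(town_encoded[:3])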
Experiment-7: Build an Artificial Neural Network by implementing the Backpropagation
algorithm and test the same using appropriate data sets.
import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)  # normalize each feature column by its maximum
y = y / 100

# Sigmoid Function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of Sigmoid Function (x is the sigmoid output)
def derivatives_sigmoid(x):
    return x * (1 - x)

# Variable initialization
epoch = 5  # Setting training iterations
lr = 0.1  # Setting learning rate
inputlayer_neurons = 2  # number of features in data set
hiddenlayer_neurons = 3  # number of hidden layer neurons
output_neurons = 1  # number of neurons at output layer

# weight and bias initialization
# np.random.uniform draws numbers uniformly from [0, 1) with the given shape
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward Propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)
    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)  # how much hidden layer wts contributed to error
    d_hiddenlayer = EH * hiddengrad
    # weight updates (the biases are left fixed in this simplified version)
    wout += hlayer_act.T.dot(d_output) * lr  # dot product of next layer error and current layer output
    wh += X.T.dot(d_hiddenlayer) * lr
    print("-----------Epoch-", i + 1, "Starts----------")
    print("Input: \n" + str(X))
    print("Actual Output: \n" + str(y))
    print("Predicted Output: \n", output)
    print("-----------Epoch-", i + 1, "Ends----------\n")

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)
-----------Epoch- 1 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83727262]
[0.81890898]
[0.84136943]]
-----------Epoch- 1 Ends----------
-----------Epoch- 2 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83789076]
[0.81951306]
[0.84198315]]
-----------Epoch- 2 Ends----------
-----------Epoch- 3 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83849879]
[0.8201075 ]
[0.84258678]]
-----------Epoch- 3 Ends----------
-----------Epoch- 4 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83909695]
[0.82069254]
[0.84318055]]
-----------Epoch- 4 Ends----------
-----------Epoch- 5 Starts----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83968547]
[0.82126838]
[0.8437647 ]]
-----------Epoch- 5 Ends----------
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.83968547]
[0.82126838]
[0.8437647 ]]
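For comparison, the same toy fit can be reproduced with a library implementation. A sketch using sklearn's MLPRegressor (the hyperparameters here are illustrative assumptions, not the original network's settings):

from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(3,), activation='logistic',
                   learning_rate_init=0.1, max_iter=5000, random_state=0)
mlp.fit(X, y.ravel())  # same normalized data as above
print(mlp.predict(X))  # should approach [0.92 0.86 0.89] after training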
Experiment-8: Write a program to implement k-Nearest Neighbor algorithm to
classify the iris data set. Print both correct and wrong predictions.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
dataset = load_iris()
X_train, X_test, y_train, y_test = train_test_split(dataset["data"], dataset["target"], random_state=0)
kn = KNeighborsClassifier(n_neighbors=1)
kn.fit(X_train, y_train)
for i in range(len(X_test)):
    x = X_test[i]
    x_new = np.array([x])
    prediction = kn.predict(x_new)
    print("TARGET=", y_test[i], dataset["target_names"][y_test[i]],
          "PREDICTED=", prediction, dataset["target_names"][prediction[0]])
print(kn.score(X_test, y_test))
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 2 virginica PREDICTED= [2] virginica
TARGET= 1 versicolor PREDICTED= [1] versicolor
TARGET= 0 setosa PREDICTED= [0] setosa
TARGET= 1 versicolor PREDICTED= [2] virginica
0.9736842105263158
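Since the experiment asks for correct and wrong predictions explicitly, the loop can tag each line. A minimal variation of the code above (the CORRECT/WRONG labels are an illustrative addition):

wrong = 0
for xi, yi in zip(X_test, y_test):
    pred = kn.predict([xi])[0]
    status = "CORRECT" if pred == yi else "WRONG"
    if status == "WRONG":
        wrong += 1
    print(status, "target:", dataset["target_names"][yi], "predicted:", dataset["target_names"][pred])
print("Wrong predictions:", wrong, "of", len(y_test))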
Experiment-9: Implement the non-parametric Locally Weighted Regression
algorithm in order to fit data points. Select the appropriate data set for your
experiment and draw graphs.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def kernel(point, xmat, k):
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))  # eye - identity matrix
    for j in range(m):
        diff = point - xmat[j]  # use the xmat parameter, not the global X
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k**2))
    return weights

def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

def graphPlot(X, ypred):
    sortindex = X[:, 1].argsort(0)  # argsort - indices that sort by bill amount
    xsort = X[sortindex][:, 0]
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(bill, tip, color='green')
    ax.plot(xsort[:, 1], ypred[sortindex], color='red', linewidth=5)
    plt.xlabel('Total bill')
    plt.ylabel('Tip')
    plt.show()

# load data points
data = pd.read_csv('10-dataset.csv')
bill = np.array(data.total_bill)  # we use only the bill amount and tip columns
tip = np.array(data.tip)
mbill = np.mat(bill)  # np.mat converts the 1-D array into a 2-D matrix
mtip = np.mat(tip)
m = np.shape(mbill)[1]
one = np.mat(np.ones(m))
X = np.hstack((one.T, mbill.T))  # 244 rows, 2 cols: a column of ones and the bill amounts
ypred = localWeightRegression(X, mtip, 8)  # increase k to get smoother curves
graphPlot(X, ypred)
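The bandwidth k controls how local the fit is: small values track the data closely, large values smooth it out. A short sketch comparing a few bandwidths on the same design matrix (the k values are illustrative):

for k in [1, 3, 8]:  # hypothetical bandwidths to compare
    ypred_k = localWeightRegression(X, mtip, k)
    graphPlot(X, ypred_k)  # smaller k -> wigglier curve, larger k -> smoother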