# ML Lab Codes
# ===================================================================================
#SET-2
#A.Create data frame and access the data in a Pandas data frame.
import pandas as pd
# Load the diabetes dataset into a DataFrame.
df = pd.read_csv('/content/diabetes.csv')

# Quick looks at the data: first rows, last rows and 10 randomly sampled rows.
print(df.head())
print(df.tail())
print(df.sample(10))

# Structure of the frame: column labels and (rows, columns).
print(df.columns)
print(df.shape)

# Rows at positions 10 through 20.
print(df.iloc[10:21])

# A single column, wrapped in a list before printing.
specific_data = [df["Glucose"]]
print(specific_data)

# The row at position 5.
print(df.iloc[5])

# Frequency table and summary statistics of the Insulin column.
insulin = df["Insulin"]
print(insulin.value_counts())
for stat in (insulin.sum, insulin.mean, insulin.median, insulin.min, insulin.max):
    print(stat())

# Rename BloodPressure to BP in place, then show the frame and its null mask.
newcols = {"BloodPressure": "BP"}
df.rename(columns=newcols, inplace=True)
print(df)
print(df.isnull())
print("--------------------------------------------------------")
#B.To perform the data preprocessing techniques on the dataset (rescale,
# standardize, normalize, binarize)
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
# Read the dataset and take its raw values as an array.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# Columns 0-7 are the input components, column 8 is the output component.
X, Y = array[:, :8], array[:, 8]
# Show floats with three decimals.
set_printoptions(precision=3)

# Rescaling: map the inputs into the range 0..1.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[:5, :])

# Standardization: 0 mean, 1 standard deviation.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[:5, :])

# Normalization: each row scaled to length 1.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[:5, :])

# Binarization around a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5, :])
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting a univariate density plot and a multivariate
# scatter plot.
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Univariate view: one density curve per column on a 3x3 grid of subplots.
df.plot.density(subplots=True, layout=(3, 3))
plt.show()

# Multivariate view: scatter-plot matrix of the DataFrame.
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.To implement Linear Discriminant Analysis (LDA) on the dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 10-fold split with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
# Linear Discriminant Analysis with default settings.
model = LinearDiscriminantAnalysis()
# Score the model on every fold and report the mean score.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
print("--------------------------------------------------------")
#E.Evaluate the performance of the algorithm by calculating regression metrics (Mean
# Absolute Error, Mean Squared Error and R-squared)
# Cross-validated regression metrics for `model` on X / Y using `kfold`.
# scikit-learn's "neg_*" scorers return the error multiplied by -1 (so that a
# higher score is always better); the mean is therefore negated before it is
# printed so MAE and MSE are both reported as positive errors.

# Cross Validation Regression MAE
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_absolute_error')
print("MAE:", -1*results.mean(), results.std())
# Cross Validation Regression MSE
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("MSE:", -1*results.mean(), results.std())
# Cross Validation Regression R^2 (already a "higher is better" score; no sign flip)
results = cross_val_score(model, X, Y, cv=kfold, scoring='r2')
print("R^2:", results.mean(), results.std())
# ===================================================================================
#SET-3
#A.To implement Feature Selection Techniques on a sample data set.
#B.Program to implement Univariate Selection using the chi-squared (chi2) statistical
# test for non-negative features to select 4 of the best features.
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Load the data; columns 0-7 are the features, column 8 is the target.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
X, Y = array[:, :8], array[:, 8]

# Univariate selection: keep the 4 features with the best chi-squared score.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Per-feature scores, printed with three decimals.
np.set_printoptions(precision=3)
print(fit.scores_)

# First five rows reduced to the selected features.
features = fit.transform(X)
print(features[:5, :])
print("--------------------------------------------------------")
#C. Recursive Feature Elimination (RFE) with the logistic regression algorithm to
# select the top 3 features.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
# Logistic regression (up to 256 solver iterations) is the estimator RFE wraps.
model = LogisticRegression(max_iter=256)
# Recursive feature elimination down to 3 features.
rfe = RFE(estimator=model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# Report how many features were kept, the selection mask and the ranking.
print("Num Features:", fit.n_features_)
print("Selected Features:", fit.support_)
print("Feature Ranking:", fit.ranking_)
print("--------------------------------------------------------")
#D.Principal Component Analysis (PCA) and select 3 principal components.
from sklearn.decomposition import PCA
# Fit a 3-component PCA on the feature matrix.
pca = PCA(n_components=3)
fit = pca.fit(X)
# Share of variance explained by each component, then the component vectors.
print("Explained Variance:", fit.explained_variance_ratio_)
print(fit.components_)
# ============================================================================
#SET-4
#A.To perform the data preprocessing techniques on the dataset (rescale,
# standardize, normalize, binarize)
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
# Read the dataset and take its raw values as an array.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# Columns 0-7 are the input components, column 8 is the output component.
X, Y = array[:, :8], array[:, 8]
# Show floats with three decimals.
set_printoptions(precision=3)

# Rescaling: map the inputs into the range 0..1.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[:5, :])

# Standardization: 0 mean, 1 standard deviation.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[:5, :])

# Normalization: each row scaled to length 1.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[:5, :])

# Binarization around a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5, :])
print("--------------------------------------------------------")
#B.To implement k-Nearest Neighbors.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# k-nearest-neighbours classifier with default settings.
model = KNeighborsClassifier()
# Shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
# Mean score across the folds.
results_knn = cross_val_score(model, X, Y, cv=kfold)
print(results_knn.mean())
print("--------------------------------------------------------")
#C. Analyze the sample data by plotting univariate box plots and a multivariate
# correlation matrix plot
import matplotlib.pyplot as plt
import seaborn as sns

# Univariate view: one box plot per column on a 3x3 grid of subplots.
df.plot.box(subplots=True, layout=(3, 3))
plt.show()

# Multivariate view: annotated heatmap of the pairwise correlations,
# with the colour scale fixed to -1..1 and columns named on both axes.
correlations = df.corr()
column_labels = list(df.columns)
sns.heatmap(
    correlations,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=column_labels,
    yticklabels=list(column_labels),
)
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Metric helpers and the hold-out splitter used below.
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: mean and standard deviation of the cross-validated scores.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated like the accuracy.
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", results.mean(), results.std())
# ===================================================================================
#SET-5
#A.To perform the data preprocessing techniques on the dataset (rescale,
# standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
# Read the dataset and take its raw values as an array.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# Columns 0-7 are the input components, column 8 is the output component.
X, Y = array[:, :8], array[:, 8]
# Show floats with three decimals.
set_printoptions(precision=3)

# Rescaling: map the inputs into the range 0..1.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[:5, :])

# Standardization: 0 mean, 1 standard deviation.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[:5, :])

# Normalization: each row scaled to length 1.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[:5, :])

# Binarization around a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5, :])
print("--------------------------------------------------------")
#B.To implement Naive Bayes.
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
# Gaussian Naive Bayes classifier with default settings.
model = GaussianNB()
# Mean score across the folds.
results_nb = cross_val_score(model, X, Y, cv=kfold)
print(results_nb.mean())
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting univariate whisker plots and a
# multivariate scatter plot
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Univariate view: one box-and-whisker plot per column on a 3x3 grid.
df.plot.box(subplots=True, layout=(3, 3))
plt.show()

# Multivariate view: scatter-plot matrix of the DataFrame.
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Metric helpers and the hold-out splitter used below.
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: mean and standard deviation of the cross-validated scores.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated like the accuracy.
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", results.mean(), results.std())
# =============================================================
#SET-6
#A.To display all arithmetic operations using NumPy arrays.
import numpy as np
# Two 2x3 integer operands.
z1 = np.array([[1, 2, 3], [4, 5, 6]])
z2 = np.array([[7, 8, 9], [10, 11, 12]])

# Element-wise arithmetic, one NumPy function per operation.
zadd = np.add(z1, z2)            # addition
zsub = np.subtract(z1, z2)       # subtraction
zmul = np.multiply(z1, z2)       # multiplication
zdiv = np.divide(z1, z2)         # true division
zfdiv = np.floor_divide(z1, z2)  # floor division
zmod = np.mod(z1, z2)            # remainder
zpow = np.power(z1, z2)          # exponentiation
z1neg = np.negative(z1)          # sign flip of the first operand

# Display the results in the order they were computed.
for shown in (zadd, zsub, zmul, zdiv, zfdiv, zmod, zpow, z1neg):
    print(shown)
print("--------------------------------------------------------")
#B.To implement Classification id3 Decision tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Load the data; columns 0-7 are the inputs, column 8 is the class label.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
X = array[:, 0:8]
Y = array[:, 8]
# criterion="entropy" makes the tree choose splits by information gain
# (ID3-style). random_state is pinned, like the splits in this file, so the
# fitted tree and every metric derived from it are the same on each run.
model = DecisionTreeClassifier(criterion="entropy", random_state=12)
# Hold out 33% of the rows for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
model.fit(X_train, Y_train)
prediction = model.predict(X_test)
# `prediction` already holds one class label per test row, so the labels
# themselves are shown. np.argmax over these labels would only give the
# position of the first row predicted as the highest class, not a class.
print("Predicted Classes:", prediction)
#C.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Cross-validation tools, metric helpers and the hold-out splitter used below.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated like the accuracy.
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", results.mean(), results.std())
# ===================================================================================
#SET-7
#A.To perform the data preprocessing techniques on the dataset (rescale,
# standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
# Read the dataset and take its raw values as an array.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# Columns 0-7 are the input components, column 8 is the output component.
X, Y = array[:, :8], array[:, 8]
# Show floats with three decimals.
set_printoptions(precision=3)

# Rescaling: map the inputs into the range 0..1.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[:5, :])

# Standardization: 0 mean, 1 standard deviation.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[:5, :])

# Normalization: each row scaled to length 1.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[:5, :])

# Binarization around a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5, :])
print("--------------------------------------------------------")
#B.To implement Support Vector Machines
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
# Support vector classifier with default settings.
model = SVC()
# Mean score across the folds.
results_svm = cross_val_score(model, X, Y, cv=kfold)
print(results_svm.mean())
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting a univariate whisker plot and a
# multivariate scatter plot
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Univariate view: one box-and-whisker plot per column on a 3x3 grid.
df.plot.box(subplots=True, layout=(3, 3))
plt.show()

# Multivariate view: scatter-plot matrix of the DataFrame.
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Metric helpers and the hold-out splitter used below.
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: mean and standard deviation of the cross-validated scores.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated like the accuracy.
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", results.mean(), results.std())
# ===================================================================================
#SET-8
#A.To display all arithmetic operations using NumPy arrays.
import numpy as np
# Two 2x3 integer operands.
z1 = np.array([[1, 2, 3], [4, 5, 6]])
z2 = np.array([[7, 8, 9], [10, 11, 12]])

# Element-wise arithmetic, one NumPy function per operation.
zadd = np.add(z1, z2)            # addition
zsub = np.subtract(z1, z2)       # subtraction
zmul = np.multiply(z1, z2)       # multiplication
zdiv = np.divide(z1, z2)         # true division
zfdiv = np.floor_divide(z1, z2)  # floor division
zmod = np.mod(z1, z2)            # remainder
zpow = np.power(z1, z2)          # exponentiation
z1neg = np.negative(z1)          # sign flip of the first operand

# Display the results in the order they were computed.
for shown in (zadd, zsub, zmul, zdiv, zfdiv, zmod, zpow, z1neg):
    print(shown)
print("--------------------------------------------------------")
#B.To implement Random Forest algorithm
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Load the data; columns 0-7 are the inputs, column 8 is the class label.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
X = array[:, 0:8]
Y = array[:, 8]
# Shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, random_state=12, shuffle=True)
# Forest of 100 trees, each split considering 3 candidate features.
# random_state is pinned (matching the seed used for the folds) so repeated
# runs report the same scores instead of drifting with each random forest.
model = RandomForestClassifier(n_estimators=100, max_features=3, random_state=12)
# Mean score across the folds.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
print("--------------------------------------------------------")
#C.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Metric helpers and the hold-out splitter used below.
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: mean and standard deviation of the cross-validated scores.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated like the accuracy.
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", results.mean(), results.std())
# ============================================================================
#SET-9
#A.Create data frame and access the data in a Pandas data frame.
import pandas as pd
# Load the diabetes dataset into a DataFrame.
df = pd.read_csv('/content/diabetes.csv')

# Quick looks at the data: first rows, last rows and 10 randomly sampled rows.
print(df.head())
print(df.tail())
print(df.sample(10))

# Structure of the frame: column labels and (rows, columns).
print(df.columns)
print(df.shape)

# Rows at positions 10 through 20.
print(df.iloc[10:21])

# A single column, wrapped in a list before printing.
specific_data = [df["Glucose"]]
print(specific_data)

# The row at position 5.
print(df.iloc[5])

# Frequency table and summary statistics of the Insulin column.
insulin = df["Insulin"]
print(insulin.value_counts())
for stat in (insulin.sum, insulin.mean, insulin.median, insulin.min, insulin.max):
    print(stat())

# Rename BloodPressure to BP in place, then show the frame and its null mask.
newcols = {"BloodPressure": "BP"}
df.rename(columns=newcols, inplace=True)
print(df)
print(df.isnull())
print("--------------------------------------------------------")
#B.Principal Component Analysis (PCA) and select 3 principal components.
from sklearn.decomposition import PCA
# Columns 0-7 of the frame's values are the features, column 8 the target.
array = df.values
X, Y = array[:, :8], array[:, 8]
# Fit a 3-component PCA on the feature matrix.
pca = PCA(n_components=3)
fit = pca.fit(X)
# Share of variance explained by each component, then the component vectors.
print("Explained Variance:", fit.explained_variance_ratio_)
print(fit.components_)
print("--------------------------------------------------------")
#C.Combine models into ensemble predictions on the data set using the AdaBoost
# algorithm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Shuffled 10-fold cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)
# AdaBoost ensemble of 100 estimators, seeded for repeatable results.
model = AdaBoostClassifier(n_estimators=100, random_state=12)
# Mean score across the folds.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# ==================================================================================
#SET-10
#A.To perform the data preprocessing techniques on the dataset (rescale,
# standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
# Read the dataset and take its raw values as an array.
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# Columns 0-7 are the input components, column 8 is the output component.
X, Y = array[:, :8], array[:, 8]
# Show floats with three decimals.
set_printoptions(precision=3)

# Rescaling: map the inputs into the range 0..1.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[:5, :])

# Standardization: 0 mean, 1 standard deviation.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[:5, :])

# Normalization: each row scaled to length 1.
scaler = Normalizer()
scaler.fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[:5, :])

# Binarization around a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5, :])
print("--------------------------------------------------------")
#B.To implement the non-linear machine learning algorithms k-Nearest Neighbors and
# Naive Bayes.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Both classifiers are scored on the same shuffled, seeded 10-fold split.
kfold = KFold(n_splits=10, shuffle=True, random_state=12)

# k-nearest neighbours: mean cross-validated score.
model = KNeighborsClassifier()
results_knn = cross_val_score(model, X, Y, cv=kfold)
print(results_knn.mean())

# Gaussian Naive Bayes: mean cross-validated score.
# `model` is left bound to this classifier afterwards.
model = GaussianNB()
results_nb = cross_val_score(model, X, Y, cv=kfold)
print(results_nb.mean())
print("--------------------------------------------------------")
#C.Evaluate the performance of the algorithm by Accuracy,
# Confusion Matrix, Precision, Recall, F-Score, AUC (Area Under the Curve)-ROC
# Metric helpers and the hold-out splitter used below.
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Accuracy: mean and standard deviation of the cross-validated scores.
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:", results.mean(), results.std())

# Confusion matrix on a single hold-out split (33% of the rows for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=12)
predicted = model.fit(X_train, Y_train).predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)

# Precision, recall and F1 on the same hold-out predictions.
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
f1 = f1_score(Y_test, predicted)
print("F1 score:", f1)

# Area under the ROC curve, cross-validated; kept in its own variable so the
# accuracy scores in `results` remain available.
aucresults = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:", aucresults.mean(), aucresults.std())
print("--------------------------------------------------------")
#D.Analyze the performance metrics by plotting a graph
import matplotlib.pyplot as plt
import numpy as np
# Metric names paired, by position, with the values computed earlier.
metrics = ['Accuracy', 'Precision', 'Recall', 'F-score', 'AUC-ROC']
scores = [results.mean(), precision, recall, f1, aucresults.mean()]

# One bar per metric, labelled with the metric name on the x axis.
bar_positions = np.arange(len(metrics))
plt.bar(bar_positions, scores)
plt.xticks(np.arange(len(metrics)), metrics)
plt.ylabel('Score')
plt.show()