ML Lab Codes

This document collects machine learning lab programs applied to the Pima Indians diabetes dataset. It covers data preprocessing (rescaling, standardization, normalization, and binarization); exploratory analysis with univariate histogram, density, and box plots as well as correlation matrix and scatter matrix plots; classification with logistic regression, k-nearest neighbors, linear discriminant analysis, naive Bayes, support vector machines, decision trees, random forests, and AdaBoost; feature selection via univariate selection, recursive feature elimination, and principal component analysis; and model evaluation with cross-validated metrics such as accuracy, precision, recall, F-score, AUC-ROC, mean absolute error, mean squared error, and R-squared.


#SET-1

#A.To display all arithmetic operations using NumPy arrays.


import numpy as np
z1=np.array([[1,2,3],[4,5,6]])
z2=np.array([[7,8,9],[10,11,12]])
zadd=np.add(z1,z2)
print(zadd)
zsub=np.subtract(z1,z2)
print(zsub)
zmul=np.multiply(z1,z2)
print(zmul)
zdiv=np.divide(z1,z2)
print(zdiv)
zfdiv=np.floor_divide(z1,z2)
print(zfdiv)
zmod=np.mod(z1,z2)
print(zmod)
zpow=np.power(z1,z2)
print(zpow)
z1neg=np.negative(z1)
print(z1neg)
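# Note (a minimal sketch): the same arithmetic is available through NumPy's
# operator overloading; a quick equivalence check against the results above:
print(np.array_equal(zadd, z1 + z2))  # np.add is the + operator
print(np.array_equal(zmul, z1 * z2))  # np.multiply is the * operator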
print("--------------------------------------------------------")
#B.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
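# Sanity check (a minimal sketch): MinMaxScaler also supports
# inverse_transform, so the rescaled values map back to the original units.
mm = MinMaxScaler(feature_range=(0, 1))
roundtrip = mm.inverse_transform(mm.fit_transform(X))
print(np.allclose(roundtrip, X))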
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting a uni-variate histogram plot and a correlation matrix plot
#Histogram plot
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [40, 30]  # set figure size before plotting
df.hist()
plt.show()
#correlation matrix plot
import seaborn as sns
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1,
xticklabels=list(df.columns), yticklabels=list(df.columns))
plt.show()
print("--------------------------------------------------------")
#D.Split the Pima Indians dataset into train (67%) and test (33%) sets and implement a logistic regression model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model = LogisticRegression(max_iter=500)
model.fit(X_train, Y_train)
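# Quick hold-out check (sketch): score() reports mean accuracy on the 33%
# test split before the cross-validated metrics below.
print("Test accuracy:", model.score(X_test, Y_test))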
#E.Evaluate the performance of the algorithm by calculating regression metrics (Mean Absolute Error, Mean Squared Error and R-squared)
#Cross Validation Regression MAE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_absolute_error')
print("MAE:",results.mean(), results.std())
#Cross Validation Regression MSE
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("MSE:",-1*results.mean(), results.std())
#Cross Validation Regression R^2
results = cross_val_score(model, X, Y, cv=kfold, scoring='r2')
print("R^2:",results.mean(), results.std())

===========================================================================
#SET-2
#A.Create data frame and access the data in a Pandas data frame.
import pandas as pd
df=pd.read_csv('/content/diabetes.csv')
print(df.head())
print(df.tail())
print(df.sample(10))
print(df.columns)
print(df.shape)
print(df[10:21])
specific_data=[df["Glucose"]]
print(specific_data)
print(df.iloc[5])
print(df["Insulin"].value_counts())
print(df["Insulin"].sum())
print(df["Insulin"].mean())
print(df["Insulin"].median())
print(df["Insulin"].min())
print(df["Insulin"].max())
newcols={"BloodPressure":"BP"}
df.rename(columns=newcols,inplace=True)
print(df)
print(df.isnull())
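# A compact numeric summary (sketch): describe() reports count, mean, std,
# min/max and quartiles for every numeric column in one call.
print(df.describe())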
print("--------------------------------------------------------")
#B.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting a uni-variate density plot and a multivariate scatter plot
#Density plot
import matplotlib.pyplot as plt
df.plot(kind='density', subplots=True, layout=(3,3))
plt.show()
#Scatter plot
from pandas.plotting import scatter_matrix
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.To implement Linear Discriminant Analysis (LDA) on the dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
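# LDA can also act as a supervised projection (sketch): with two classes it
# yields at most one discriminant axis via fit_transform.
lda_proj = LinearDiscriminantAnalysis(n_components=1).fit_transform(X, Y)
print(lda_proj[0:5])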
print("--------------------------------------------------------")
#E.Evaluate the performance of the algorithm by calculating regression metrics (Mean Absolute Error, Mean Squared Error and R-squared)
#Cross Validation Regression MAE
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_absolute_error')
print("MAE:",results.mean(), results.std())
#Cross Validation Regression MSE
results = cross_val_score(model, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("MSE:",-1*results.mean(), results.std())
#Cross Validation Regression R^2
results = cross_val_score(model, X, Y, cv=kfold, scoring='r2')
print("R^2:",results.mean(), results.std())

===========================================================================

#SET-3
#A.To implement Feature Selection Techniques on a sample data set.
#B.Program to implement Univariate Selection using the chi-squared (chi2) statistical test for non-negative features to select 4 of the best features
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
df=pd.read_csv('/content/diabetes.csv')
array=df.values
X=array[:,0:8]
Y=array[:,8]
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
print(features[0:5,:])
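# Mapping the mask back to column names (sketch, assumes the first 8
# diabetes columns are the features, as above):
print("Selected columns:", list(df.columns[0:8][fit.get_support()]))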
print("--------------------------------------------------------")
#C.Recursive Feature Elimination (RFE) with the logistic regression algorithm to select the top 3 features
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
model = LogisticRegression(max_iter=500)  # raised to avoid convergence warnings on unscaled data
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features:",fit.n_features_)
print("Selected Features:",fit.support_)
print("Feature Ranking:", fit.ranking_)
print("--------------------------------------------------------")
#D.Principal Component Analysis (PCA): select 3 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
fit = pca.fit(X)
print("Explained Variance:",fit.explained_variance_ratio_)
print(fit.components_)
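# Cumulative explained variance (sketch) shows how much of the total
# variance the 3 components retain together.
print("Cumulative:", np.cumsum(fit.explained_variance_ratio_))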
============================================================================

#SET-4
#A.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
print("--------------------------------------------------------")
#B.To implement k-Nearest Neighbors.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
model = KNeighborsClassifier()
results_knn = cross_val_score(model, X, Y, cv=kfold)
print(results_knn.mean())
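# The default is k=5 neighbors; a small sweep (sketch) shows how the
# cross-validated accuracy varies with k.
for k in (3, 5, 7):
    knn = KNeighborsClassifier(n_neighbors=k)
    print(k, cross_val_score(knn, X, Y, cv=kfold).mean())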
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting uni-variate box plots and a multi-variate correlation matrix plot
#Box plot
import matplotlib.pyplot as plt
df.plot(kind='box', subplots=True, layout=(3,3))
plt.show()
#correlation matrix plot
import seaborn as sns
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1,
xticklabels=list(df.columns), yticklabels=list(df.columns))
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",results.mean(), results.std())

===========================================================================

#SET-5
#A.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
print("--------------------------------------------------------")
#B.To implement Naive Bayes.
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
model = GaussianNB()
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results_nb = cross_val_score(model, X, Y, cv=kfold)
print(results_nb.mean())
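# GaussianNB also exposes class probabilities (sketch): fit on the full data
# here only to illustrate predict_proba's output shape.
model.fit(X, Y)
print(model.predict_proba(X[0:3]))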
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting uni-variate whisker plots and a multi-variate scatter plot
#Whisker/box plot
import matplotlib.pyplot as plt
df.plot(kind='box', subplots=True, layout=(3,3))
plt.show()
#Scatter plot
from pandas.plotting import scatter_matrix
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",results.mean(), results.std())
=============================================================
#SET-6
#A.To display all arithmetic operations using NumPy arrays.
import numpy as np
z1=np.array([[1,2,3],[4,5,6]])
z2=np.array([[7,8,9],[10,11,12]])
zadd=np.add(z1,z2)
print(zadd)
zsub=np.subtract(z1,z2)
print(zsub)
zmul=np.multiply(z1,z2)
print(zmul)
zdiv=np.divide(z1,z2)
print(zdiv)
zfdiv=np.floor_divide(z1,z2)
print(zfdiv)
zmod=np.mod(z1,z2)
print(zmod)
zpow=np.power(z1,z2)
print(zpow)
z1neg=np.negative(z1)
print(z1neg)
print("--------------------------------------------------------")
#B.To implement classification with an ID3-style (entropy) decision tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv('/content/diabetes.csv')
array = df.values
X = array[:,0:8]
Y = array[:,8]
model = DecisionTreeClassifier(criterion="entropy")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
prediction=model.predict(X_test)
print("Predicted Class Index:",np.argmax(prediction))
#C.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",results.mean(), results.std())

===========================================================================
#SET-7
#A.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
print("--------------------------------------------------------")
#B.To implement Support Vector Machines
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
model = SVC()
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results_svm = cross_val_score(model, X, Y, cv=kfold)
print(results_svm.mean())
print("--------------------------------------------------------")
#C.Analyze the sample data by plotting a uni-variate whisker plot and a multi-variate scatter plot
#Whisker plot
import matplotlib.pyplot as plt
df.plot(kind='box', subplots=True, layout=(3,3))
plt.show()
#Scatter plot
from pandas.plotting import scatter_matrix
scatter_matrix(df)
plt.show()
print("--------------------------------------------------------")
#D.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",results.mean(), results.std())

===========================================================================

#SET-8
#A.To display all arithmetic operations using NumPy arrays.
import numpy as np
z1=np.array([[1,2,3],[4,5,6]])
z2=np.array([[7,8,9],[10,11,12]])
zadd=np.add(z1,z2)
print(zadd)
zsub=np.subtract(z1,z2)
print(zsub)
zmul=np.multiply(z1,z2)
print(zmul)
zdiv=np.divide(z1,z2)
print(zdiv)
zfdiv=np.floor_divide(z1,z2)
print(zfdiv)
zmod=np.mod(z1,z2)
print(zmod)
zpow=np.power(z1,z2)
print(zpow)
z1neg=np.negative(z1)
print(z1neg)
print("--------------------------------------------------------")
#B.To implement Random Forest algorithm
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
df=pd.read_csv('/content/diabetes.csv')
array = df.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
model = RandomForestClassifier(n_estimators=100, max_features=3)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
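# Out-of-bag estimate (sketch, assumes oob_score=True is acceptable here):
# a built-in accuracy estimate without a separate validation split.
rf = RandomForestClassifier(n_estimators=100, max_features=3, oob_score=True)
rf.fit(X, Y)
print("OOB accuracy:", rf.oob_score_)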
print("--------------------------------------------------------")
#C.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
results = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",results.mean(), results.std())

============================================================================
#SET-9
#A.Create data frame and access the data in a Pandas data frame.
import pandas as pd
df=pd.read_csv('/content/diabetes.csv')
print(df.head())
print(df.tail())
print(df.sample(10))
print(df.columns)
print(df.shape)
print(df[10:21])
specific_data=[df["Glucose"]]
print(specific_data)
print(df.iloc[5])
print(df["Insulin"].value_counts())
print(df["Insulin"].sum())
print(df["Insulin"].mean())
print(df["Insulin"].median())
print(df["Insulin"].min())
print(df["Insulin"].max())
newcols={"BloodPressure":"BP"}
df.rename(columns=newcols,inplace=True)
print(df)
print(df.isnull())
print("--------------------------------------------------------")
#B.Principal Component Analysis (PCA): select 3 principal components.
from sklearn.decomposition import PCA
array=df.values
X = array[:,0:8]
Y = array[:,8]
pca = PCA(n_components=3)
fit = pca.fit(X)
print("Explained Variance:",fit.explained_variance_ratio_)
print(fit.components_)
print("--------------------------------------------------------")
#C.Combine models into ensemble predictions on the dataset using the AdaBoost algorithm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
model = AdaBoostClassifier(n_estimators=100, random_state=12)
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
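# AdaBoost exposes per-stage predictions (sketch): training accuracy as
# estimators are added, sampled every 20 stages.
import numpy as np
model.fit(X, Y)
print([round(np.mean(p == Y), 3) for p in list(model.staged_predict(X))[::20]])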

==================================================================================

#SET-10
#A.To perform the data preprocessing techniques on the dataset (rescale, standardize, normalize, binarize)
import pandas as pd
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
df = pd.read_csv('/content/diabetes.csv')
array = df.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
set_printoptions(precision=3)
# Rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
print(rescaledX[0:5,:])
# Standardize data (0 mean, 1 stdev)
scaler = StandardScaler().fit(X)
rescaledX = scaler.transform(X)
print(rescaledX[0:5,:])
# Normalize data (length of 1)
scaler = Normalizer().fit(X)
normalizedX = scaler.transform(X)
print(normalizedX[0:5,:])
# binarization
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
print(binaryX[0:5,:])
print("--------------------------------------------------------")
#B.To implement non-linear machine learning algorithms: k-Nearest Neighbors and Naive Bayes
#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
model = KNeighborsClassifier()
kfold = KFold(n_splits=10, random_state=12,shuffle=True)
results_knn = cross_val_score(model, X, Y, cv=kfold)
print(results_knn.mean())
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
results_nb = cross_val_score(model, X, Y, cv=kfold)
print(results_nb.mean())
print("--------------------------------------------------------")
#C.Evaluate the performance of the algorithm by Accuracy, Confusion Matrix, Precision, Recall, F-Score and AUC-ROC (Area Under the ROC Curve)
#Accuracy
results = cross_val_score(model, X, Y, cv=kfold, scoring='accuracy')
print("Accuracy:",results.mean(), results.std())
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.33,random_state=12)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
print(matrix)
#Precision
from sklearn.metrics import precision_score
precision = precision_score(Y_test, predicted)
print("Precision:", precision)
#Recall
from sklearn.metrics import recall_score
recall = recall_score(Y_test, predicted)
print("Recall:", recall)
#F-score
from sklearn.metrics import f1_score
f1=f1_score(Y_test,predicted)
print("F1 score:", f1)
#AUC-ROC
aucresults = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
print("AUC:",aucresults.mean(), aucresults.std())
print("--------------------------------------------------------")
#D.Analyze the performance metrics by plotting a graph
import matplotlib.pyplot as plt
import numpy as np
metrics = ['Accuracy', 'Precision', 'Recall', 'F-score', 'AUC-ROC']
scores = [results.mean(), precision, recall, f1, aucresults.mean()]
plt.bar(np.arange(len(metrics)), scores)
plt.xticks(np.arange(len(metrics)), metrics)
plt.ylabel('Score')
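# Optional touch (sketch): annotate each bar with its score for readability.
for i, s in enumerate(scores):
    plt.text(i, s, "%.2f" % s, ha='center', va='bottom')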
plt.show()
