Data Science Lab Program Printout
Data Science Lab Program Printout
Output:
Ex No: 2
Using Numpy Arrays
Program:
import numpy as np

# Two 3x3 integer matrices used as operands for the element-wise demos.
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("The first matrix value is ::>", a)
b = np.array([[2, 3, 4], [5, 6, 7], [8, 9, 10]])
print("The second matrix value is ::>", b)

# Element-wise arithmetic through the ndarray operators; `/` is true
# division, so `div` is a float array even though both operands are ints.
add = a + b
sub = a - b
div = a / b
mul = a * b

# Report the results in a fixed order: addition, subtraction, division,
# multiplication.
for _label, _result in (
    ("Addition Matrix Resultant is ::>", add),
    ("Subtraction Matrix Resultant is ::>", sub),
    ("Division Matrix Resultant is ::>", div),
    ("Multiplication Matrix Resultant is ::>", mul),
):
    print(_label, _result)
Output:
The first matrix value is ::> [[1 2 3]
[4 5 6]
[7 8 9]]
The second matrix value is ::>[[ 2 3 4]
[ 5 6 7]
[ 8 9 10]]
Addition Matrix Resultant is ::>[[ 3 5 7]
[ 9 11 13]
[15 17 19]]
Subtraction Matrix Resultant is ::> [[-1 -1 -1]
[-1 -1 -1]
[-1 -1 -1]]
Division Matrix Resultant is ::>
[[0.5 0.66666667 0.75]
[0.8 0.83333333 0.85714286]
[0.875 0.88888889 0.9]]
Multiplication Matrix Resultant is ::>[[ 2 6 12]
[20 30 42]
[56 72 90]]
Ex No :3
Working with Pandas data frames
Program:
import pandas as pd

# Five-person sample frame used to demonstrate the `validate` option of
# pd.merge.
df = pd.DataFrame({
    'Name': ['Alberto Franco', 'Gino Mcneill', 'Ryan Parkes', 'Eesha Hinton', 'Syed Wharton'],
    'Date_Of_Birth': ['17/05/2002', '16/02/1999', '25/09/1998', '11/05/2002', '15/09/1997'],
    'Age': [18.5, 21.2, 22.5, 22, 23],
})
print("Original DataFrame:")
print(df)

# Take an independent copy first, then drop different rows from each frame
# so the two frames only partially overlap.
df1 = df.copy(deep=True)
df = df.drop([0, 1])
df1 = df1.drop([2])
print("\nNew DataFrames:")
print(df)
print(df1)

# No `on=` is given, so each merge joins on every column the frames share;
# `validate` raises MergeError if the stated key uniqueness does not hold.
print('\n"one_to_one”: check if merge keys are unique in both left and right datasets:"')
df_one_to_one = pd.merge(df, df1, validate="one_to_one")
print(df_one_to_one)
print('\n"one_to_many” or “1:m”: check if merge keys are unique in left dataset:')
df_one_to_many = pd.merge(df, df1, validate="one_to_many")
print(df_one_to_many)
print('“many_to_one” or “m:1”: check if merge keys are unique in right dataset:')
df_many_to_one = pd.merge(df, df1, validate="many_to_one")
print(df_many_to_one)
Output:
Original DataFrame:
Name Date_Of_Birth Age
0 Alberto Franco 17/05/2002 18.5
1 Gino Mcneill 16/02/1999 21.2
2 Ryan Parkes 25/09/1998 22.5
3 Eesha Hinton 11/05/2002 22.0
4 Syed Wharton 15/09/1997 23.0
New DataFrames:
Name Date_Of_Birth Age
Reading data from text files, Excel and the web and exploring various commands
for doing descriptive analytics on the Iris data set.
Program:
# Data collection
# NOTE: written as Jupyter notebook cells -- bare expressions such as
# `dataset.head()` only display their value when run in a notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# The same data set is loaded from a text file, an Excel workbook and a CSV
# file in turn; only the last frame read is used below.
dataset = pd.read_csv("iris.txt")
dataset.head()
dataset = pd.read_excel("iris.xlsx")
dataset.head()
dataset = pd.read_csv("iris.csv")
dataset.head()
dataset.info()
dataset.Species.unique()

# EDA
dataset.describe()
# Species holds string labels, so correlate the numeric columns only.
dataset.select_dtypes(include="number").corr()
dataset.Species.value_counts()
sns.FacetGrid(dataset, hue="Species", height=6).map(plt.scatter, "Sepal.Length", "Sepal.Width").add_legend()
sns.FacetGrid(dataset, hue="Species", height=6).map(plt.scatter, "Petal.Length", "Petal.Width").add_legend()
sns.pairplot(dataset, hue="Species")
plt.hist(dataset["Sepal.Length"], bins=25);
# FacetGrid.map needs an axes-level plotting function; the figure-level
# `displot` would open a new figure for every facet.
sns.FacetGrid(dataset, hue="Species", height=6).map(sns.histplot, "Sepal.Width").add_legend();
sns.boxplot(x='Species', y='Petal.Length', data=dataset)

# Preprocessing
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x = dataset.drop(['Species'], axis=1)
y = dataset['Species']
scaler = ss.fit(x)
x_stdscaler = scaler.transform(x)
x_stdscaler
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Splitting
# NOTE(review): the split uses the unscaled `x`; `x_stdscaler` is computed
# above but not used by the models below -- confirm this is intended.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
# Attribute access without a call: a notebook displays the bound method's
# repr (which embeds the frame), not the value counts.
x_train.value_counts

# Model Selection
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
y_pred
from sklearn.metrics import accuracy_score
# scikit-learn's signature is (y_true, y_pred).
accuracy_score(y_test, y_pred)

# Prediction
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy_score(y_test, y_pred)
OUTPUT:
Dataset Heads:
Dataset unique:
array(['setosa', 'versicolor', 'virginica'], dtype=object)
Dataset description:
Dataset correlation
Scatter Plot
Pair plot
Histogram
Box Plot
Preprocessing
array([[-1.72054204e+00, -9.00681170e-01, 1.01900435e+00,
-1.34022653e+00, -1.31544430e+00],
[-1.69744751e+00, -1.14301691e+00, -1.31979479e-01,
-1.34022653e+00, -1.31544430e+00],
[-1.67435299e+00, -1.38535265e+00, 3.28414053e-01,
-1.39706395e+00, -1.31544430e+00],
[-1.65125846e+00, -1.50652052e+00, 9.82172869e-02,
-1.28338910e+00, -1.31544430e+00],
[-1.58197489e+00, -1.50652052e+00, 7.88807586e-01,
[-2.42492502e-01, -2.94841818e-01, -3.62176246e-01,
7.62758269e-01, 7.90670654e-01]])
Splitting
<bound method DataFrame.value_counts of Unnamed: 0 Sepal.Length Sepal.Width Petal.Length Petal.Width
81 82 5.5 2.4 3.7 1.0
133 134 6.3 2.8 5.1 1.5
137 138 6.4 3.1 5.5 1.8
75 76 6.6 3.0 4.4 1.4
109 110 7.2 3.6 6.1 2.5
.. ... ... ... ... ...
71 72 6.1 2.8 4.0 1.3
106 107 4.9 2.5 4.5 1.7
14 15 5.8 4.0 1.2 0.2
92 93 5.8 2.6 4.0 1.2
102 103 7.1 3.0 5.9 2.1
Program:
# Univariate analysis: frequency, central tendency, dispersion, skewness and
# kurtosis of the diabetes data set.
# NOTE: written as Jupyter notebook cells -- bare expressions only display
# their value when run in a notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE: machine-specific location; adjust to where the CSV actually lives.
df = pd.read_csv("C:\\Users \\Desktop \\FDS LAb\\diabetes_csv.csv")
df.head()
df.skin.value_counts()

# The recorded results cover only the eight numeric columns, so every
# frame-wide reduction is restricted to numeric data explicitly; without
# `numeric_only=True` recent pandas raises on a non-numeric column instead
# of silently skipping it.
df.mean(axis=0, numeric_only=True)
print(df.loc[:, 'skin'].mean())
df.mean(axis=1, numeric_only=True)[0:5]
df.median(numeric_only=True)
print(df.loc[:, 'skin'].median())
df.median(axis=1, numeric_only=True)[0:5]
df.mode()
df.std(numeric_only=True)
print(df.loc[:, 'skin'].std())
df.std(axis=1, numeric_only=True)[0:5]
df.var(numeric_only=True)
print(df.skew(numeric_only=True))
df.describe()
df.describe(include='all')
print(df.kurtosis(numeric_only=True))

# Reference density of 100000 standard-normal samples with its mean and
# median marked as vertical lines.
norm_data = pd.DataFrame(np.random.normal(size=100000))
norm_data.plot(kind="density", figsize=(10, 10));
# Plot black line at mean
plt.vlines(norm_data.mean(), ymin=0, ymax=0.4, linewidth=5.0);
# Plot red line at median
plt.vlines(norm_data.median(), ymin=0, ymax=0.4, linewidth=2.0, color="red");
Output:
Head Data:
Frequency:
0 227
32 31
30 27
27 23
23 22
33 20
28 20
18 20
31 19
19 18
39 18
29 17
40 16
25 16
Mean:
20.536458333333332
0 43.153375
1 29.868875
2 38.871500
3 40.283375
4 57.298500
dtype: float64
Mode:
Median:
23.0
0 34.30
1 27.80
2 15.65
3 25.55
4 37.50
dtype: float64
Standard Deviation:
15.952217567727677
0 49.397286
1 31.519803
2 62.253392
3 37.591100
4 61.533847
dtype: float64
Variance:
preg 11.354056
plas 1022.248314
pres 374.647271
skin 254.473245
insu 13281.180078
mass 62.159984
pedi 0.109779
age 138.303046
dtype: float64
Skewness:
preg 0.901674
plas 0.173754
pres -1.843608
skin 0.109372
insu 2.272251
dtype: float64
Kurtosis:
preg 0.159220
plas 0.640780
pres 5.180157
skin -0.520072
insu 7.214260
mass 3.290443
pedi 5.594954
age 0.643159
dtype: float64
Graph:
Ex No: 5.a(2)
Univariate analysis for Pima Indians Diabetes data set
Program:
# Univariate analysis of the Pima Indians Diabetes data set.
# NOTE: written as Jupyter notebook cells -- bare expressions only display
# their value when run in a notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# The CSV has no header row: read with the default settings, its first
# record (6, 148, 72, 35, 0, 33.6, 0.627, 50, 1) is consumed as the column
# labels and dropped from the data. Supply the labels explicitly instead.
PIMA_COLUMNS = ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class']
# NOTE: machine-specific location; adjust to where the CSV actually lives.
df = pd.read_csv("C:\\Users \\Desktop\\FDS LAb\\pima-indians-diabetes.csv",
                 header=None, names=PIMA_COLUMNS)
df.head()
df.mean(axis=0)
print(df.loc[:, 'skin'].mean())    # 4th column (was labelled '35')
df.mean(axis=1)[0:5]
df.median()
print(df.loc[:, 'mass'].median())  # 6th column (was labelled '33.6')
df.median(axis=1)[0:5]
df.mode()
df.std()
print(df.loc[:, 'skin'].std())
df.std(axis=1)[0:5]
df.var()
print(df.skew())
print(df.kurtosis())

# Reference density of 100000 standard-normal samples with its mean and
# median marked as vertical lines.
norm_data = pd.DataFrame(np.random.normal(size=100000))
norm_data.plot(kind="density", figsize=(10, 10));
# Plot black line at mean
plt.vlines(norm_data.mean(), ymin=0, ymax=0.4, linewidth=5.0);
# Plot red line at median
plt.vlines(norm_data.median(), ymin=0, ymax=0.4, linewidth=2.0, color="red");
Output:
Head Data:
0 1 85 66 29 0 26.6 0.351 31 0
2 1 89 66 23 94 28.1 0.167 21 0
0 26.550111
1 34.663556
2 35.807444
3 51.043111
4 27.866778
dtype: float64
Mode:
Median:
32.0
0 26.6
1 8.0
2 23.0
3 35.0
4 5.0
dtype: float64
Standard Deviation:
15.954059060433842
0 31.119744
1 59.585320
2 37.639873
3 60.541569
4 41.114755
dtype: float64
Variance:
6 11.362809
148 1022.622445
72 375.125415
35 254.532001
0 13290.194335
33.6 62.237755
0.627 0.109890
50 138.116452
1 0.227226
dtype: float64
Skewness:
6 0.903976
148 0.176412
72 -1.841911
35 0.112058
0 2.270630
33.6 -0.427950
0.627 1.921190
50 1.135165
1 0.638949
dtype: float64
Kurtosis:
6 0.161293
148 0.642992
72 5.168578
35 -0.518325
0 7.205266
33.6 3.282498
0.627 5.593374
50 0.660872
1 -1.595913
dtype: float64
Graph:
Ex No: 5b1
Bivariate analysis: Linear and logistic regression modeling
Program
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline
diabetes=pd.read_csv("C:\\Users \\Desktop \\FDS LAb\\diabetes.csv")
diabetes.head()
diabetes = datasets.load_diabetes()
diabetes
print(diabetes.DESCR)
diabetes.feature_names
# Now we will split the data into the independent and independent variable
X = diabetes.data[:,np.newaxis,3]
Y = diabetes.target
#We will split the data into training and testing data
from sklearn.model_selection
import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)
# Linear Regression
From sklearn.linear_model import LinearRegression
reg=LinearRegression()
reg.fit(x_train,y_train)
y_pred = reg.predict(x_test)
Coef=reg.coef_
print(Coef)
from sklearn.metrics import mean_squared_error, r2_score
MSE=mean_squared_error(y_test,y_pred)
R2=r2_score(y_test,y_pred)
print(R2,MSE)
frommatplotlib.pyplot import *
importmatplotlib.pyplot as plt
plt.scatter(y_pred, y_test)
plt.title('Predicted data vs Real Data')
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()
plt.scatter(x_test, y_test)
plt.plot(x_test,y_pred,linewidth=2)
plt.title('Linear Regression')
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()
model = LogisticRegression()
model.fit(x_train,y_train)
y_predict=model.predict(x_test)
model_score = model.score(x_test,y_test)
print(model_score)
print(metrics.confusion_matrix(y_test, y_predict))
Output:
Diabetes Description
Diabetes dataset
----------------
Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum
measurements were obtained for each of n =442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.
Coefficient Value
[731.87600042]
0.007518796992481203
[[130 17]
[ 38 46]]
Ex No: 5b2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline
diabetes=pd.read_csv("C:\\Users\\Desktop \\FDS LAb\\pima-indians-diabetes.csv")
diabetes.head()
diabetes = datasets.load_diabetes()
diabetes print(diabetes.DESCR)
diabetes.feature_names
# Now we will split the data into the independent and independent variable
X = diabetes.data[:,np.newaxis,3]
Y = diabetes.target
#We will split the data into training and testing data
from sklearn.model_selection
import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)
# Linear Regression
from sklearn.linear_model
import LinearRegression
reg=LinearRegression()
reg.fit(x_train,y_train)
y_pred = reg.predict(x_test)
Coef=reg.coef_
print(Coef)
from sklearn.metrics
import mean_squared_error, r2_score
MSE=mean_squared_error(y_test,y_pred)
R2=r2_score(y_test,y_pred)
print(R2,MSE)
from matplotlib.pyplot
import *
import matplotlib.pyplot as plt
plt.scatter(y_pred, y_test)
plt.title('Predicted data vs Real Data')
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()
plt.scatter(x_test, y_test)
plt.plot(x_test,y_pred,linewidth=2)
plt.title('Linear Regression')
plt.xlabel('y_pred')
plt.ylabel('y_test')
plt.show()
model = LogisticRegression()
model.fit(x_train,y_train)
y_predict=model.predict(x_test)
model_score = model.score(x_test,y_test)
print(model_score)
print(metrics.confusion_matrix(y_test, y_predict))
Output:
Diabetes Description
Diabetes dataset
----------------
Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum
measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.
Coefficient Value
[692.2463534]
Mean Square Error and R2 value
0.22801179880891975 4213.0324099222125
Predicted data vs Real Data
Linear Regression
0.00685448922416352
[[113 11]
[ 28 38]]
Ex No: 5c1
Multiple regression analysis
Program:
import numpy as np
import matplot lib.pyplot as plt
import pandas as pd
from sklearn import datasets
%matplot lib inline
diabetes=pd.read_csv("C:\\Users \\Desktop \\FDS LAb\\diabetes.csv")
diabetes.head()
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
X = diabetes[["Age", "BMI"]]## the input variables
y = diabetes["Glucose"] ## the output variables, the one you want to predict
X = sm.add_constant(X) ## let's add an intercept (beta_0) to our model
# Note the difference in argument order
model2 = sm.OLS(y, X).fit()
predictions = model2.predict(X) # make the predictions by the model
# Print out the statistics
model2.summary()
Output:
Head data:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
1            1       85             66             29        0  26.6                     0.351   31        0
3            1       89             66             23       94  28.1                     0.167   21        0
Statistics:
Df Model: 2
3 5 88 66 21 23 24.4 0.342 30 0
Statistics:
Df Model: 2
Program
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
%matplotlib inline
importseaborn as sns
importmatplotlib.pyplot as plt
df=pd.read_csv("C:\\Users\\Desktop\\FDS LAb\\train.csv")
df.head()
sns.distplot(df["Fare"])
sns.distplot(df["Age"])
plt.contour(df[["Fare","Parch"]])
Output:
Density Plot:
Contour Plot
EX.No : 6c
Apply and explore Correlation and Scatter plotting functions on UCI data sets
Program:
import numpy as np
import pandas as pd
import seaborn as sn
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
df=pd.read_csv("C:\\Users\\JP\\Desktop\\SBECW\\FDS LAb\\train.csv")
df.head()
plt.figure(figsize=(8,8))
sn.scatterplot(x="Age", y="Fare", hue="Sex", data=df)
plt.show()
df.corr()
# plotting correlation heatmap
dataplot = sns.heatmap(df.corr(), cmap="YlGnBu", annot=True)
# displaying heatmap
plt.show()
Output
Scatter Plot
Heat Map
EX.No : 6d
Apply and explore histogram plotting functions on UCI data sets.
Program
importnumpy as np
import pandas as pd
importseaborn as sn
%matplotlib inline
importseaborn as sns
importmatplotlib.pyplot as plt
df=pd.read_csv("C:\\Users \\Desktop \\FDS LAb\\train.csv")
df.head()
plt.hist(df["Fare"])
Output:
Histogram :
(array([732., 106., 31., 2., 11., 6., 0., 0., 0., 3.]),
array([ 0. , 51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,
307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),
<BarContainer object of 10 artists>)
EX .No : 6E
Apply and explore three dimensional plotting functions on UCI data sets.
Program
importnumpy as np
import pandas as pd
importseaborn as sn
%matplotlib inline
importseaborn as sns
importmatplotlib.pyplot as plt
frommpl_toolkits import mplot3d
df=pd.read_csv("C:\\Users \\Desktop\\FDS LAb\\train.csv")
df.head()
%matplotlib inline
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')
ax = plt.axes(projection='3d')
zline = np.linspace(0, 15, 1000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')
zdata = df[["Fare"]]
xdata = df[["Age"]]
ydata = df[["Parch"]]
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens');
Output
Three Dimensional Lines
OUTPUT:
Ortho Projection
Pseudo-cylindrical projections
Perspective projection
Conic projection