0% found this document useful (0 votes)
15 views

Data Science Practical Book - Ipynb

Uploaded by

Tejas Tadka
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views

Data Science Practical Book - Ipynb

Uploaded by

Tejas Tadka
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 21

Data Science Practical Notebook

T.Y.B.Sc(Computer Science)
CS 358 : Data Science Practicals

 Assignment 1 : The Data Science Environment

 Assignment 2 : Statistical Data Analysis

 Assignment 3 : Data Preprocessing

 Assignment 4 : Data Visualization


ASSIGNMENT 1 : THE DATA SCIENCE ENVIRONMENT
SET A

1.

Create and view a data frame


# Load the libraries used in this cell
import pandas as pd
import numpy as np

# Column-wise records: ten students with their age and percentage
data_values = {
    'Name': list('ABCDEFGHIJ'),
    'Age': [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
    'Percentage': [56, 62, 42, 74, 32, 63, 74, 84, 96, 21],
}

# Build the data frame directly from the column dictionary
data = pd.DataFrame(data_values)
data  # bare expression so a notebook displays the frame

index Name Age Percentage


0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21

2.

# Report the overall dimensions of the frame: cell count, shape, rows, columns
n_rows, n_cols = data.shape
summary = "Size={}\n Shape={}\nNumber of rows={}\nNumber of Columns={}"
print(summary.format(data.size, data.shape, n_rows, n_cols))

# Show the first three column labels
print("\n Feature Names = {}, {}, {}".format(*data.columns[:3]))

Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3
Feature Names = Name, Age, Percentage
3.

Adding 5 rows and 1 column


# Five extra records keyed by their new index label; some carry missing
# values and label 13 repeats the record stored at label 10.
new_rows = {
    10: ['K', 21, 56],
    11: ['L', 21, None],
    12: ['M', None, 45],
    13: ['K', 21, 56],
    14: ['O', 25, 84],
}
for label, row in new_rows.items():
    data.loc[label] = row

# New column with no values yet
data["Remarks"] = None
data
index Name Age Percentage Remarks
0 A 26 56 null
1 B 28 62 null
2 C 20 42 null
3 D 15 74 null
4 E 20 32 null
5 F 16 63 null
6 G 18 74 null
7 H 17 84 null
8 I 22 96 null
9 J 21 21 null
10 K 21 56 null
11 L 21 null null
12 M null 45 null
13 K 21 56 null
14 O 25 84 null

4.

print("Number of Observations = ", len(data.index))

# Count every missing cell across all columns
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
# NOTE(review): this repeats the statement above verbatim; kept so the
# printed output is unchanged.
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
# Tally of rows flagged as duplicates (True) versus not (False).
# The closing parenthesis of print() was missing in the original line.
print(data.duplicated().value_counts())
Number of Observations = 15

Total missing values in a DataFrame :

17

Total missing values in a DataFrame :

17
False 14
True 1
dtype: int64
5.

Removing a column and missing values


# Discard the 'Remarks' column, then every row that still has a missing value
data2 = data.drop(columns='Remarks').dropna(axis=0)
# Show the cleaned frame
data2
index Name Age Percentage
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
10 K 21 56
13 K 21 56
14 O 25 84

6.

Scatterplot
# pyplot is needed for plt.show(); nothing earlier in the file imports it
import matplotlib.pyplot as plt

# Percentage against Name, one point per row of the cleaned frame
data2.plot.scatter(x='Name', y='Percentage',
                   title="Scatterplot")
plt.show()

SET B

1.
import pandas as pd

# Load the height/weight dataset
data = pd.read_csv('SOCR-HeightWeight.csv')
data.tail(n=10)    # last 10 rows
data.sample(n=20)  # 20 randomly chosen rows
data.head(n=10)    # first 10 rows
index Height(Inches) Weight(Pounds)
0 65.78331 112.9925
1 71.51521 136.4873
2 69.39874 153.0269
3 68.2166 142.3354
4 67.78781 144.2971
5 68.69784 123.3024
6 69.80204 141.4947
7 70.01472 136.4623
8 67.90265 112.3723
9 66.78236 120.6672

2.

Add column "BMI"


# New frame with a BMI column computed as weight / height^2.
# NOTE(review): this is pounds / inches^2 with no unit conversion; the
# conventional imperial BMI multiplies by 703 - confirm which is intended.
height_in = data['Height(Inches)']
weight_lb = data['Weight(Pounds)']
data2 = data.assign(BMI=weight_lb / (height_in * height_in))

3.

# Largest and smallest value in the BMI column
bmi = data2['BMI']
print("\n Maximum BMI = ", max(bmi))
print("\n Minimum BMI = ", min(bmi))

Maximum BMI = 0.03701443692089851

Minimum BMI = 0.018591137267932455


ASSIGNMENT 2 : STATISTICAL DATA ANALYSIS
SET A :

1.
import numpy as np

# The two data points
a = np.array([2, 3])
b = np.array([4, 5])

# Euclidean distance is the norm of the difference vector
distance = np.linalg.norm(a - b)
print("Euclidean Distance = ", distance)

Euclidean Distance = 2.8284271247461903

2.
Create and view a data frame
# Libraries for the data frame and the descriptive statistics
import pandas as pd
import numpy as np
import scipy.stats as s

# Raw data: one score per student
data_values = {
    'Name': list('ABCDEFGHIJ'),
    'Scores': [56, 62, 42, 74, 32, 63, 74, 84, 96, 21],
}

# Build the frame from the column dictionary and print it
data = pd.DataFrame(data_values)
print(data)

# Mean, extremes and range of the scores
scores = data["Scores"]
highest = max(scores)
lowest = min(scores)
print("\n Mean Score = ", s.tmean(scores))
print("\n Maximum = ", highest)
print("\n Minimum = ", lowest)
print("\n Range = ", highest - lowest)

# Quartiles and the inter-quartile range
q1, q3 = np.percentile(scores, [25, 75])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)
Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21

Mean Score = 60.4

Maximum = 96

Minimum = 21

Range = 75

Q3 = 74.0

Q1 = 45.5

IQR = 28.5

3.

Program to find Manhattan distance between all pairs of points


import math
def manhattan(a, b, n):
    """Return the Manhattan (L1) distance between two points.

    Args:
        a: Coordinates of the first point (indexable sequence of numbers).
        b: Coordinates of the second point (indexable sequence of numbers).
        n: Number of leading coordinates to include in the distance.

    Returns:
        The sum of ``abs(a[i] - b[i])`` for ``i`` in ``range(n)``.

    Raises:
        IndexError: If ``n`` exceeds the length of a list-like ``a`` or ``b``.
    """
    # Use the builtin sum() instead of a local accumulator that shadows it.
    return sum(abs(a[i] - b[i]) for i in range(n))

# Coordinates of the two points
a = [3, 5, 5, 6, 5, 4, 3]
b = [-2, 3, 2, -5, 2, 3, -1]

# Number of coordinates (len(b) would do equally well)
n = len(a)

print("Manhattan Distance = ", manhattan(a, b, n))

Manhattan Distance = 29

SET B

1.

# Load the iris dataset
data = pd.read_csv('iris.csv')
print("Number of records for different variety/class attribute \n")
# Rows per distinct 'variety' value
variety_counts = data['variety'].value_counts()
variety_counts
Number of records for different variety/class attribute

Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64

2.

import pandas as pd
from pandas.api.types import is_numeric_dtype

print("Iris Dataset : Column wise Mean and Median \n")
for col in data.columns:
    # Only numeric columns are summarised; any other column is skipped
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % data[col].mean())
        print('\t Median = %.2f' % data[col].median())

Iris Dataset : Column wise Mean and Median

sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30

SET C :

1.

Program to find Minkowski Distance between two points


from math import *
from decimal import Decimal
def nth_root(value, root):
    """Return the ``root``-th root of ``value`` rounded to three decimals.

    Args:
        value: Number whose root is taken.
        root: Degree of the root; converted with ``float()``.

    Returns:
        A ``Decimal`` rounded to 3 decimal places.

    Raises:
        ZeroDivisionError: If ``root`` is zero.
    """
    root_value = 1 / float(root)
    return round(Decimal(value) ** Decimal(root_value), 3)


def minkowski(a, b, n):
    """Return the Minkowski distance of order ``n`` between ``a`` and ``b``.

    Args:
        a: Coordinates of the first point.
        b: Coordinates of the second point; paired with ``a`` via ``zip``,
            so extra coordinates on the longer input are ignored.
        n: Order of the distance, used both as the exponent applied to each
            absolute coordinate difference and as the degree of the root.

    Returns:
        The distance as a ``Decimal`` rounded to 3 decimal places.
    """
    total = sum(pow(abs(i - j), n) for i, j in zip(a, b))
    return nth_root(total, n)

# The two points
a = [-1, 5]
b = [2, 4]
# Passed to minkowski() as the order (root value); here it is taken from
# the number of coordinates
n = len(a)
print("\n Minkowski Distance = ", minkowski(a, b, n))
Minkowski Distance = 3.162
2.

import numpy as np

x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
# The second heading previously repeated "array1"
print("\nOriginal array2:")
print(y)
# np.cov(x, y) returns the 2x2 covariance matrix of the two arrays (not a
# cross-correlation), so the heading says so.
cov_matrix = np.cov(x, y)
print("\nCovariance matrix of the said arrays:\n", cov_matrix)
Original array1:
[0 1 3]

Original array1:
[2 4 5]

Cross-correlation of the said arrays:


[[2.33333333 2.16666667]
[2.16666667 2.33333333]]

3.

Create and view a data frame


# Libraries used in this exercise
import pandas as pd
import numpy as np
import scipy.stats as s

# Marks of ten students in five subjects; student ids are kept as strings
data_values = {
    'Student': [str(roll) for roll in range(1, 11)],
    'Subject 1': [41, 62, 35, 15, 21, 65, 84, 75, 42, 95],
    'Subject 2': [56, 62, 42, 74, 32, 63, 74, 84, 96, 21],
    'Subject 3': [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
    'Subject 4': [41, 75, 84, 62, 13, 56, 42, 84, 95, 23],
    'Subject 5': [45, 74, 62, 31, 21, 54, 45, 86, 95, 32],
}

# Build the frame from the column dictionary
data = pd.DataFrame(data_values)
data  # bare expression so a notebook displays the frame
Index Student Subject 1 Subject 2 Subject 3 Subject 4 Subject 5
0 1 41 56 26 41 45
1 2 62 62 28 75 74
2 3 35 42 20 84 62
3 4 15 74 15 62 31
4 5 21 32 20 13 21
5 6 65 63 16 56 54
6 7 84 74 18 42 45
7 8 75 84 17 84 86
8 9 42 96 22 95 95
9 10 95 21 21 23 32
from pandas.api.types import is_numeric_dtype
from scipy.stats.mstats import gmean
import statistics as stat

print("Subject wise Mean \n")
for col in data.columns:
    # Only numeric columns are summarised; any other column is skipped
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Arithmetic Mean = %.2f' % data[col].mean())
        print('\t Geometric Mean = %.2f' % gmean(data[col]))
        print('\t Harmonic Mean = %.2f' % stat.harmonic_mean(data[col]))
Subject wise Mean

Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
ASSIGNMENT 3 : DATA PREPROCESSING
SET A

1.

import pandas as pd
import io
# Load Data.csv with an explicit comma separator
data = pd.read_csv('Data.csv',sep = ',')
# Bare expression: shown as the cell output in a notebook
data
index Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
a.

# Summary statistics as reported by DataFrame.describe()
data.describe()
index Age Salary
count 9.0 9.0
mean 38.77777777777778 63777.77777777778
std 7.693792591722527 12265.579661982732
min 27.0 48000.0
25% 35.0 54000.0
50% 38.0 61000.0
75% 44.0 72000.0
max 50.0 83000.0

b.

# Report cell count, shape, row count and column count of the frame.
# The format string is assembled from two adjacent literals so that no
# literal is broken across physical lines.
summary = ("Size = {} \n Shape of DataFrame Object = {}\n Number of rows"
           " = {} \n Number of Columns = {}")
print(summary.format(data.size, data.shape, data.shape[0], data.shape[1]))

Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4

c.

# Heading, then the first three records of the frame
print("\n first 3 rows from Dataset")
data.head(n=3)
First 3 rows from dataset

index Country Age Salary Purchased


0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No

2. a.

Applying OneHot Encoding on Country Column


from sklearn.preprocessing import OneHotEncoder

# Encoder configured with handle_unknown='ignore'
enc = OneHotEncoder(handle_unknown='ignore')
# Fit on the 'Country' column and convert the result to a dense array
country_matrix = enc.fit_transform(data[['Country']])
enc_data = pd.DataFrame(country_matrix.toarray())
enc_data

index 0 1 2 3
0 1.0 0.0 0.0 0.0
1 0.0 0.0 0.0 1.0
2 0.0 0.0 1.0 0.0
3 0.0 0.0 0.0 1.0
4 0.0 0.0 1.0 0.0
5 1.0 0.0 0.0 0.0
6 0.0 0.0 0.0 1.0
7 0.0 1.0 0.0 0.0
8 0.0 0.0 1.0 0.0
9 1.0 0.0 0.0 0.0

# Join the encoded columns onto the original frame (DataFrame.join with its default arguments)
data_merge= data.join(enc_data)
data_merge

index Country Age Salary Purchased 0 1 2 3


0 France 44.0 72000.0 No 1.0 0.0 0.0 0.0
1 Spain 27.0 48000.0 Yes 0.0 0.0 0.0 1.0
2 Germany 30.0 54000.0 No 0.0 0.0 1.0 0.0
3 Spain 38.0 61000.0 No 0.0 0.0 0.0 1.0
4 Germany 40.0 NaN Yes 0.0 0.0 1.0 0.0
5 France 35.0 58000.0 Yes 1.0 0.0 0.0 0.0
6 Spain NaN 52000.0 No 0.0 0.0 0.0 1.0
7 France 48.0 79000.0 Yes 0.0 1.0 0.0 0.0
8 Germany 50.0 83000.0 No 0.0 0.0 1.0 0.0
9 France 37.0 67000.0 Yes 1.0 0.0 0.0 0.0

b.
Applying label encoding on purchased column
from sklearn.preprocessing import LabelEncoder

# Replace each distinct 'Purchased' label with an integer code
labelencoder = LabelEncoder()
purchased_codes = labelencoder.fit_transform(data['Purchased'])
data['Purchased'] = purchased_codes
data

index Country Age Salary Purchased


0 France 44.0 72000.0 0
1 Spain 27.0 48000.0 1
2 Germany 30.0 54000.0 0
3 Spain 38.0 61000.0 0
4 Germany 40.0 NaN 1
5 France 35.0 58000.0 1
6 Spain NaN 52000.0 0
7 France 48.0 79000.0 1
8 Germany 50.0 83000.0 0
9 France 37.0 67000.0 1
#The purchased labels are replaced by the numbers 0 and 1, where 'No' is
#assigned 0 and 'Yes' is assigned 1.

SET B

1.

# Rescaling Data
import pandas
import scipy
import numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Raw values of the frame as an array
# NOTE(review): assumes every column of `data` is numeric at this point - confirm
array = data.values
# Scaler targeting the [0, 1] range, fitted and applied in one step
data_scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
print(data_scaled.round(3))

Min Max Scaled Data

[[0.248 ………………………………, 0.4]


………………………………………………,
[0.124 ………………………, 0.4 0.6]]
2.

# Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s

# Fit the scaler on the data, then apply the fitted transformation
scaler = StandardScaler()
scaler.fit(data)
std_data = scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
# Overall mean and standard deviation of the transformed values
print("\n Standardized Mean : ", s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ", round(std_data.std(), 2))

Standardized Data

[[-0.528 ………………………… ]
[………………………………,
[………………………………………, 0.45084835]]

Standardized Mean : 0.0


Standardized Standard Deviation : 1.0

3.

# Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
# L1 normalisation of the data (norm='l1')
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
# Bare expression: shown as the cell output in a notebook
norm_data

Normalized Data

array([[0.099…………………………………….],
[………………………………………….., 0.06487013]])

4.

# Binarizing Data
# Fit a Binarizer with threshold 0.0, then apply it. The call is split into
# two statements so the method name `transform` is no longer broken across
# lines.
binarizer = preprocessing.Binarizer(threshold=0.0).fit(data)
binarized_data = binarizer.transform(data)
print("\n Binarized Data \n ")
binarized_data

Binarized Data

array([[1., 1., 0., ..., 1., 1., 1.],


[1., …………......, 1., 1., 1.],
[1., 1., 1., ..., 1., 1., 1.]])
SET C

1.

import pandas as pd
import io

# read_csv already returns a DataFrame, so no extra wrapping is needed
data = pd.read_csv('Student_bucketing.csv')

# Cut 'marks' into five bins and name them from lowest to highest.
# The 'Above_average' label was previously broken across two lines.
bucket_labels = ['Poor', 'Below_average', 'Average', 'Above_average',
                 'Excellent']
data['bucket'] = pd.cut(data['marks'], 5, labels=bucket_labels)
data.head(10)

index Student_id Age Grade Employed marks bucket


0 1 19 1st Class yes 29 Poor
1 2 20 2nd Class no 41 Below_average
2 3 18 1st Class no 57 Average
3 4 21 2nd Class no 29 Poor
4 5 19 1st Class no 57 Average
5 6 20 2nd Class yes 53 Average
6 7 19 3rd Class yes 78 Above_average
7 8 21 3rd Class yes 70 Above_average
8 9 22 3rd Class yes 97 Excellent
9 10 21 1st Class no 58 Average
ASSIGNMENT 4 : DATA VISUALIZATION
SET A

1.

from matplotlib import pyplot as plt
import numpy as np

# Two samples of 50 values each from np.random.randn, drawn one after the other
a1, a2 = (np.random.randn(50) for _ in range(2))

# Thin dotted black line through the first sample
plt.plot(a1, color="k", linestyle=':', linewidth=1)
plt.title("Line Chart")
plt.show()

# Scatter of the two samples; marker colours come from a third random sample
point_colours = np.random.randn(50)
plt.scatter(a1, a2, c=point_colours, marker='*', alpha=0.9)
plt.title("Scatter Plot")
plt.show()

# Histogram of the second sample in 15 bins
hist_style = dict(facecolor='lawngreen', edgecolor="k", alpha=0.7)
plt.hist(a2, bins=15, **hist_style)
print("Histogram")
Histogram
# Box plot of the second sample, drawn with vert=False and patch_artist=True
box=plt.boxplot(a2,vert=False,patch_artist = True)
print("Boxplot")

2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris dataset
data = pd.read_csv("iris.csv")
# Count plot over the 'variety' column
sns.countplot(data=data, x='variety')
plt.title("Iris Species Count")
plt.show()
3.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("iris.csv")
# plt.subplots returns (figure, axes); unpack both so `ax` really is the
# axes object, and hand it to the pie plot explicitly.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Share of each variety as a percentage, with every wedge offset by 0.1
data['variety'].value_counts().plot.pie(
    explode=[0.1, 0.1, 0.1], autopct='%1.1f%%', shadow=True, ax=ax)
plt.title("Iris Species %")
plt.show()

4.

import seaborn as sns

# Per-variety subsets of the iris frame
iris_setosa = data.loc[data["variety"] == "Setosa"]
iris_virginica = data.loc[data["variety"] == "Virginica"]
iris_versicolor = data.loc[data["variety"] == "Versicolor"]

# One histogram grid per measurement, coloured by variety. The loop replaces
# four copies of the same call whose `add_legend` name had been broken
# across lines.
for feature in ("petal.length", "petal.width", "sepal.length", "sepal.width"):
    sns.FacetGrid(data, hue="variety").map(sns.histplot, feature).add_legend()
plt.show()
SET B

1.

import seaborn as sns


import matplotlib.pyplot as plt

def graph(a):
    """Draw a seaborn box plot of column ``a`` grouped by ``variety``.

    Reads the module-level ``data`` frame.

    Args:
        a: Name of the column in ``data`` to put on the y-axis.
    """
    sns.boxplot(x="variety", y=a, data=data)

plt.figure(figsize=(10, 10))

# 2x2 grid, one panel per measurement, filled in subplot order 1..4
panels = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
for position, column in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    graph(column)

plt.show()
SET C

1.

# Pair plot over the iris frame, coloured by variety
import seaborn as sns
import matplotlib.pyplot as plt

sns.pairplot(data, height=2, hue='variety')
plt.show()

s
2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("iris.csv")
# KDE joint plot of sepal length against sepal width. `fill=True` replaces
# the deprecated `shade=True` keyword of seaborn's KDE plots.
g = sns.jointplot(x="sepal.length", y="sepal.width", fill=True,
                  data=data, kind="kde", color="b")
# Overlay the individual observations as gold stars
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
# Make the first collection on the joint axes fully transparent
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()

You might also like