Pandas DataFrame
Pandas DataFrame
Introduction
• Python Pandas is a software library written for the Python programming language for
data manipulation and analysis regardless of the origin of the data.
• Pandas is defined as an open-source library that provides high-performance data
manipulation in Python.
• The name Pandas is derived from the term "Panel Data".
• It was developed by Wes McKinney in 2008.
• Using Pandas, we can accomplish five typical steps of data processing: load, prepare, manipulate,
model and analyse.
Benefits of Pandas
• It can easily represent data in a form naturally suited for data
analysis
• It lets you write clear, concise code, so you can focus on the core logic of the program.
Made by PGT Comp Sc. Ms. Puja Gupta
Data structures in Pandas
A data structure is a way of storing and organising data so that it can be
accessed efficiently and easily later; it determines how the data is collected,
modified, and how the various operations are performed on it.
import pandas as pd
df=pd.DataFrame()
#pd.DataFrame(None)
print(df)
import pandas as pd

# Each dictionary key becomes a column name; each value supplies that column's data.
Dic={'roll':[1,2,3],'name':('a','b','c'),'marks':(24,53,66)}
df=pd.DataFrame(Dic)
# The printed frame shows the column names across the top and the row index
# down the left (no index was passed, so pandas numbers the rows itself).
print(df)
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Creating From Dictionary with custom index
import pandas as pd
Dic={'roll':[1,2,3],'name':('a','b','c'),'marks':(24,53,66)}
df=pd.DataFrame(Dic,index=[11,12,13])
# or df=pd.DataFrame(index=[11,12,13],data=Dic)
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Creating From nested list (2D List)
import pandas as pd
l=[['eng',101],['chem',99],['comp',100]]
df=pd.DataFrame(l )
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Creating From nested list (2D List) with labelled indexes
import pandas as pd
l=[['eng',101],['chem',99],['comp',100]]
df=pd.DataFrame(l,index=list(range(5,20,5)),columns=['sname','scode'])
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Add a column (always at the end)
import pandas as pd

d = {
    'name': ['puja', 'aadi', 'srish'],
    'marks': [77, 88, 99],
}
df = pd.DataFrame(d)

# Assigning to a column label that does not exist yet appends a new column.
df['UT1'] = [12, 13, 14]
# New columns can be computed from existing ones, element by element.
df['UT2'] = df['UT1'].add(5)
df['UT3'] = df[['UT1', 'UT2']].sum(axis=1)
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Add a column using insert() method
df.insert(loc, column, value, allow_duplicates=False): loc is an integer giving the position at which the new column
is inserted. The column currently at that position, and every column after it, shifts to the right.
import pandas as pd

dict1={"Name":["sanah",'chavi','suditi'],"PB1":[78,88,98],"PB2":[87,93,97]}
df=pd.DataFrame(dict1,index=['a','b','c'])
print(df)  # before insertion

# 2 is the position at which the new column 'age' is inserted, so 'PB2'
# moves one place to the right.
# allow_duplicates=True permits the insert even if a column named 'age'
# already exists; with the default (False) that case raises an error.
df.insert(2,'age',[1,2,3],allow_duplicates=True)
print(df)  # after insertion
Row index
Before Made by PGT Comp Sc. Ms. Puja Gupta
After insertion
DataFrame- Add a Row
import pandas as pd

d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
# Assigning to a row label that is not in the index appends a new row.
# len(df) is 3 here, i.e. the next unused label after 0, 1, 2.
df.loc[len(df)]=['raji',100]
# df.loc[3]=['raji',100] is equivalent here.
# If the label already exists, that row is replaced instead of a row being added.
print(df)
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Drop or Delete a Row
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
df=df.drop(1,axis=0)
# by default axis is 0, so if not given any axis it will be 0
#df.drop(1,axis=0,inplace=True) will make changes in df
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Drop or Delete a Column
import pandas as pd

d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
# axis=1 makes drop() look for the label among the columns.
# The label must be written with plain ASCII quotes.
df=df.drop('name',axis=1)
# df.drop('name',axis=1,inplace=True) changes df directly instead of returning a new frame.
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Accessing Elements from a DataFrame on the basis of some condition
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.loc[df['name']=='puja'])
#print(df[df['name']=='puja'])
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Evaluating JUST the condition (without using it to select rows):
the result is a Series of Boolean values, one per row
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df['name']=='puja')
#print(df['name']=='puja')
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Accessing Elements from a DataFrame on the basis of some condition
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.loc[df['marks']>=80])
#print(df[df['marks']>=80])
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Accessing Elements from a DataFrame on the basis of some condition
Showing few columns
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.loc[df['name']=='puja']['name'])
#single column
print(df.loc[df['name']=='puja'][['name','marks']])
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.loc[df['marks']>=80,'marks'])
print(df.loc[df['marks']>=80,['marks','name']])
print(df.loc[df['marks']>=80,:])
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- Drop or Delete a Row with some condition
import pandas as pd

d = {
    'name': ['puja', 'aadi', 'srish'],
    'marks': [77, 88, 99],
}
df = pd.DataFrame(d)

# Collect the index labels of every row whose name is 'puja' ...
ind = df.index[df['name'] == 'puja']
print(ind)
# ... and drop exactly those rows.
df = df.drop(index=ind)
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Selecting a column
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.loc[:,:]) # all rows and columns
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
print(df.iloc[:,:])
print(df.iloc[0:1,0:1])
print(df.iloc[[0,2],[0,1]])
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- Rename column names(few/all)/ Row index(few/all)
import pandas as pd

d = {
    'name': ['puja', 'aadi', 'srish'],
    'marks': [77, 88, 99],
}
df = pd.DataFrame(d)

# rename() takes old-label -> new-label mappings; labels that are not
# mentioned keep their current names.
df = df.rename(index={0: 'stud1'}, columns={'name': 'Child Name'})
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- Rename column names(all)/ Row index(all)
import pandas as pd
d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
df.columns=['Child Name','marks']
df.index=['stud1','stud2','stud3']
print(df)
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DATA
DataFrame- head() and tail() function
head()- By default, DataFrame.head() returns the first 5 rows. To get the first n rows, pass n as a
parameter, i.e. DataFrame.head(n)
tail()- By default, DataFrame.tail() returns the last 5 rows. To get the last n rows, pass n as a parameter,
i.e. DataFrame.tail(n)
import pandas as pd
d={'name':['a','b','c','d','e','f'], 'marks':[1,2,3,4,5,6]}
df=pd.DataFrame(d)
print(df.head())
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- head() and tail() function
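head()- with a negative argument (-n): returns all rows except the last n, i.e. (total no. of rows - n) rows from the top
tail()- with a negative argument (-n): returns all rows except the first n, i.e. (total no. of rows - n) rows from the bottom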
import pandas as pd
d={'name':['a','b','c','d','e','f'], 'marks':[1,2,3,4,5,6]}
df=pd.DataFrame(d)
print(df.head(-4))
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- head() and tail() function
head()- By default, DataFrame.head() returns the first 5 rows. To get the first n rows, pass n as a
parameter, i.e. DataFrame.head(n)
tail()- By default, DataFrame.tail() returns the last 5 rows. To get the last n rows, pass n as a parameter,
i.e. DataFrame.tail(n)
import pandas as pd
d={'name':['a','b','c','d','e','f'], 'marks':[1,2,3,4,5,6]}
df=pd.DataFrame(d)
print(df.tail())
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- head() and tail() function
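head()- with a negative argument (-n): returns all rows except the last n, i.e. (total no. of rows - n) rows from the top
tail()- with a negative argument (-n): returns all rows except the first n, i.e. (total no. of rows - n) rows from the bottom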
import pandas as pd
d={'name':['a','b','c','d','e','f'], 'marks':[1,2,3,4,5,6]}
df=pd.DataFrame(d)
print(df.tail(-4))
Columns names
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- modifying/accessing a single cell
import pandas as pd

d={'name':['a','b','c','d','e','f'], 'marks':[1,2,3,4,5,6]}
# The frame has to be built from d before any cell can be addressed.
df=pd.DataFrame(d)
# loc / at address a cell by row label and column label.
df.loc[0,'name']='puja gupta'
df.at[1,'name']='aadya gupta'
# iloc / iat address a cell by integer row position and column position.
df.iloc[0,1]=7
df.iat[1,1]=8
print(df)
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- Attributes
1.DataFrame.index- to display row labels
FArea={'Pre Board 1' :[10,11,12],"Pre Board 2":[20,21,22]}
df1=pd.DataFrame(FArea,index=['a','b','c'])
print(df1)
print(df1.index)
8. DataFrame.axes - to return a list representing both the axes(axis 0 (row-index) axis 1 (columns))
print(df.axes)
# Drop the row labelled 0, then the rows labelled 1 and 2 (axis=0 selects rows).
df=df.drop(0,axis=0)
df=df.drop([1,2],axis=0)
print(df)
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- Deleting a Single Column/more than 1 column
import pandas as pd

d={'name':['puja','aadi','srish','raji'], 'marks':[77,88,99,86]}
df=pd.DataFrame(d)
# Delete a single column by label.
df=df.drop('name',axis=1)
# To delete several columns at once, pass a list of labels:
# df=df.drop(['name','marks'],axis=1)
print(df)
Row index
Made by PGT Comp Sc. Ms. Puja Gupta
DataFrame- Iterating row-wise
• iterrows() : To iterate over the rows of a DataFrame, call its iterrows() function. It yields
one (row-index, Series) pair per row, where the Series holds the data of that row. In other
words, each iteration gives you a horizontal subset of the DataFrame together with its
row index.
import pandas as pd

d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
# iterrows() yields one (row index, row-as-Series) pair per row.
# The loop body must be indented for Python to accept it.
for a,b in df.iterrows():
    print(a)  # row index
    print(b)  # the row's values as a Series
import pandas as pd

d={'name':['puja','aadi','srish'], 'marks':[77,88,99]}
df=pd.DataFrame(d)
# items() yields one (column name, column-as-Series) pair per column.
# It replaces iteritems(), which was removed in pandas 2.0.
for a,b in df.items():
    print(a)  # column name
    print(b)  # the column's values as a Series
import pandas as pd
df=pd.DataFrame([1,2,3,4,5],index=[True,False,True,False,True])
#It divides dataframe into 2 groups-True rows and False rows
print(df.loc[True])
print(df.loc[False])
import pandas as pd
d={'name':['puja','aadi','srish','raji'], 'marks':[77,88,99,86]}
df=pd.DataFrame(d)
df1=df.sort_values(by='marks',axis=0,inplace=False)
print(df1)
Columns names
Row index
import pandas as pd
d={'name':['puja','aadi','srish','raji'], 'marks':[77,88,99,86]}
df=pd.DataFrame(d)
df1=df.sort_values(by='marks',axis=0,inplace=False,ascending=False)
print(df1)
Columns names
Row index
Columns names
Row index
Columns names
Row index
Row index
Row index
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
print(df.min(0))  # axis=0 (the default): minimum of each column
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# axis=1 (not the default): minimum of each row.
# numeric_only=True leaves out the text column 'subject'; numbers and strings
# cannot be compared with each other within a row.
print(df.min(axis=1,numeric_only=True))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
print(df.max(0))  # axis=0 (the default): maximum of each column
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# axis=1 (not the default): maximum of each row.
# numeric_only=True leaves out the text column 'subject'; numbers and strings
# cannot be compared with each other within a row.
print(df.max(axis=1,numeric_only=True))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
print(df.count(0))  # axis=0 (the default): number of non-missing values in each column
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
print(df.count(1))  # axis=1 (not the default): number of non-missing values in each row
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# isnull() returns a frame of the same shape holding True where a value is missing.
print(df.isnull())
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# axis=0: drop every row that contains a missing value.
print(df.dropna(axis=0))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# axis=1: drop every column that contains a missing value.
print(df.dropna(axis=1))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88],
                 'subject':["Acct",'Eco','Eng','IP']})
# Replace every missing value with 33.
print(df.fillna(33))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88]})
# axis=1: for each row, the column label that holds the row's largest value.
print(df.idxmax(axis=1))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88]})
# axis=0: for each column, the row label that holds the column's largest value.
print(df.idxmax(axis=0))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88]})
# axis=1: for each row, the column label that holds the row's smallest value.
print(df.idxmin(axis=1))
import pandas as pd
import numpy as np

# np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.
df=pd.DataFrame({"Benjamin":[99,90,95,94],
                 "Krishna":[94,89,np.nan,88]})
# axis=0: for each column, the row label that holds the column's smallest value.
print(df.idxmin(axis=0))
•To create DataFrame from more than one ndarray (numpy array)
a1=np.array([10,20,30])
a2=np.array(["A","b","c"])
df1=pd.DataFrame([a1,a2],index=["ayush","samudra"],columns=['m1','m2','m3'])