Text data - str
Integers - int
Decimals - float
Binary - bool
Dates - datetime
Categories - category
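# Sketch: a quick way to inspect the current dtypes before cleaning (assumes a DataFrame df is already loaded)
print(df.dtypes)  # dtype of every column
df.info()         # dtypes plus non-null counts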
# removing $ from revenue column and converting column to int
#step 1
df["Revenue"] = df["Revenue"].str.strip('$')
#step 2
df["Revenue"] = df["Revenue"].astype('int')
# or, if the values contain decimals, convert to float instead
df["Revenue"] = df["Revenue"].astype('float')
# after this you can sum the column
assert df["Revenue"].dtype == 'int' # This will tell if the column is rightly
converted or not, if it fine then you can sum the column
Total_Revenue = df["Revenue"].sum()
# the assert statement checks that the conversion happened correctly
assert df["Revenue"].dtype == 'int'
#if no error then we are good
# categorical variables
df["status"] = df["status"].astype('category')
# outlier treatment example
df.loc[df['avg_rating'] > 5, 'avg_rating'] = 5 # cap any avg_rating above 5 at 5
# assert that the capping worked
assert df['avg_rating'].max() <= 5
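# Sketch of the alternative treatment: drop the out-of-range rows instead of capping them
df = df[df['avg_rating'] <= 5]
assert df['avg_rating'].max() <= 5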
import datetime as dt
# converting the date column to datetime (the dtype check has to happen before .dt.date)
df["subscription_date"] = pd.to_datetime(df["subscription_date"])
assert df["subscription_date"].dtype == 'datetime64[ns]'
# keep only the date part
df["subscription_date"] = df["subscription_date"].dt.date
# check that the cleaning is correct: no subscription date can lie in the future
today_date = dt.date.today()
assert df["subscription_date"].max() <= today_date
# if the assert fails, force-clean the future dates
df.loc[df["subscription_date"] > today_date, "subscription_date"] = today_date
assert df["subscription_date"].max() <= today_date
# duplicate values
column_names = ['first_name', 'last_name', 'address']
duplicates = df.duplicated(subset = column_names, keep = False) # keep=False flags every copy of a duplicate row; the other options for keep are 'first' and 'last'
df[duplicates]
df[duplicates].sort_values(by = 'first_name')
# A row can be a duplicate across all columns or only across some of them
# If every column is duplicated, the rows can be dropped directly
df.drop_duplicates(inplace = True) # inplace=True drops the duplicated rows inside the DataFrame itself instead of returning a new object
# For partial duplicates, eg: first_name, last_name and address match but height and weight differ between the duplicate rows, one way is to combine them with groupby + agg
column_names = ['first_name', 'last_name', 'address']
summaries = {'height': 'max', 'weight': 'mean'} # max and mean are only an example; pick the statistic that fits each column
df = df.groupby(by=column_names).agg(summaries).reset_index()
#Validate if the above process is done correctly
duplicates = df.duplicated(subset = column_names,keep = False)
df[duplicates].sort_values(by = 'first_name')
# Chapter - Text and Categorical Variables
# Membership constraint - categorical columns should only contain values from a predefined set of valid categories. Eg: a blood group column cannot contain Z+, so such rows should be removed from the analysis
# Find the cleanliness category in airlines not in categories
cat_notclean = set(airlines['cleanliness']).difference(categories['cleanliness'])
# Find rows with that category
cat_notclean_rows = airlines['cleanliness'].isin(cat_notclean)
# Print rows with inconsistent category
print(airlines[cat_notclean_rows]) # these rows are inconsistent and need cleaning, which is what the next line does by filtering them out
# Print rows with consistent categories only
print(airlines[~cat_notclean_rows])
# Next: value inconsistency, eg: Married, MARRIED, UNMARRIED, Unmarried
df['marriage_status'] = df['marriage_status'].str.upper() # or .lower
df['marriage_status'].value_counts()
# at times some values can have trailing spaces
df['marriage_status'] = df['marriage_status'].str.strip()
# Creating Categorical data from the values in the data
#eg : income
# income_group column from income column
# Using cut() - create category ranges and names
import numpy as np
ranges = [0, 200000, 500000, np.inf]
group_names = ['0-200K', '200K-500K', '500K+']
# Create the income_group column
demographics['income_group'] = pd.cut(demographics['household_income'], bins=ranges, labels=group_names)
demographics[['income_group','household_income']]
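# Minimal self-contained sketch of cut() on toy data (the income values are made up)
import pandas as pd
import numpy as np
toy = pd.DataFrame({'household_income': [50_000, 250_000, 750_000]})
toy['income_group'] = pd.cut(toy['household_income'],
                             bins=[0, 200_000, 500_000, np.inf],
                             labels=['0-200K', '200K-500K', '500K+'])
print(toy)  # the three rows land in 0-200K, 200K-500K and 500K+ respectively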
# Create mapping dictionary and replace
mapping = {'Microsoft': 'DesktopOS', 'MacOS': 'DesktopOS', 'Linux': 'DesktopOS', 'IOS': 'MobileOS', 'Android': 'MobileOS'}
devices['operating_system'] = devices['operating_system'].replace(mapping)
devices['operating_system'].unique()
#Cleaning Text Data
#Regular expressions help clean patterned text, eg: names, phone numbers, emails
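# Sketch of regex-based cleaning on a phone number column (DataFrame and column names are assumptions)
import numpy as np
phones['phone_number'] = phones['phone_number'].str.replace(r'\D+', '', regex=True)  # keep digits only
phones.loc[phones['phone_number'].str.len() < 10, 'phone_number'] = np.nan            # too short to be a valid number
assert phones['phone_number'].str.len().min() >= 10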
#practice
# Store length of each row in survey_response column
resp_length = airlines['survey_response'].str.len()
# Find rows in airlines where resp_length > 40
airlines_survey = airlines[resp_length > 40]
# Assert minimum survey_response length is > 40
assert airlines_survey['survey_response'].str.len().min() > 40
# Print new survey_response column
print(airlines_survey['survey_response'])
# Uniform data
# temp in Celsius and Fahrenheit - need to standardize to a single unit
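# Sketch of the temperature fix (the 40-degree cutoff and column names are assumptions)
temp_fah = temps.loc[temps['temperature'] > 40, 'temperature']  # implausibly hot for Celsius, so treat as Fahrenheit
temp_cels = (temp_fah - 32) * (5 / 9)
temps.loc[temps['temperature'] > 40, 'temperature'] = temp_cels
assert temps['temperature'].max() < 40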
# in the case below the date format comes in many different combinations
df["Birthdate"] =pd.to_datetime(df["Birthdate"],infer_datetime_format = True,
errors = "coerce")
# The coerce will give nan values where date format is difficult to be checked
eg: 27/27/2002
# if you know the date format for sure, you can format it explicitly, for example
df["Birthdate"] = df["Birthdate"].dt.strftime("%d-%m-%Y")
# an ambiguous value like 2019-03-08 (8 March or 3 August?) needs some homework before you make such decisions
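# Sketch: if the homework shows the dates are day-first, you can tell pandas explicitly
df["Birthdate"] = pd.to_datetime(df["Birthdate"], dayfirst = True, errors = "coerce")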
# standardize amount in dollars
# Find values of acct_cur that are equal to 'euro'
acct_eu = banking['acct_cur'] == 'euro'
# Convert acct_amount where it is in euro to dollars
banking.loc[acct_eu, 'acct_amount'] = banking.loc[acct_eu, 'acct_amount'] * 1.1
# Unify acct_cur column by changing 'euro' values to 'dollar'
banking.loc[acct_eu, 'acct_cur'] = 'dollar'
# Assert that only dollar currency remains
assert banking['acct_cur'].unique() == 'dollar'
# Convert account_opened to datetime
banking['account_opened'] = pd.to_datetime(banking['account_opened'],
                                           # Infer datetime format
                                           infer_datetime_format = True,
                                           # Return missing value for errors
                                           errors = 'coerce')
# Get year of account opened
banking['acct_year'] = banking['account_opened'].dt.strftime('%Y')
# Print acct_year
print(banking['acct_year'])
#Cross Field Validation
#check data integrity when merging data from different sources
# check that values across columns are logically consistent, eg: e_class + b_class + p_class should match the total_class column
sum_classes = df[['e_class','b_class','p_class']].sum(axis = 1)
consistent_sum = sum_classes == df['total_class']
inconsistent_df = df[~consistent_sum]
consistent_df = df[consistent_sum]
#just like above example checking for consistency between age and birthdate
import datetime as dt
df['birthday'] = pd.to_datetime(df['birthday'])
today = dt.date.today()
age_cal = today.year - df['birthday'].dt.year
age_equ = age_cal == df['age']
inconsistent_dfage = df[~age_equ]
consistent_dfage = df[age_equ]
# exercise example
# Store fund columns to sum against
fund_columns = ['fund_A', 'fund_B', 'fund_C', 'fund_D']
# Find rows where fund_columns row sum == inv_amount
inv_equ = banking[fund_columns].sum(axis = 1) == banking['inv_amount']
# Store consistent and inconsistent data
consistent_inv = banking[inv_equ]
inconsistent_inv = banking[~inv_equ]
# Count the inconsistent rows
print("Number of inconsistent investments: ", inconsistent_inv.shape[0])
#Completeness and Missing Data
# missing data can be represented by NA, NaN, 0, '.', etc.
# checking for NAs in dataframe
df.isna() # here you get only True or False for columns in df
df.isna().sum() # here you get the count of NAs for the columns in the df
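# Sketch: the share of missing values per column is often more useful than the raw count
print(df.isna().mean().sort_values(ascending = False) * 100)  # percentage of missing values per column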
#Missing data visualization
import missingno as msno
import matplotlib.pyplot as plt
#plotting missing data
msno.matrix(df)
plt.show()
#from the above we observe that CO2 data is missing in many records
missing = df[df['CO2'].isna()]
notmissing = df[~df['CO2'].isna()] # ~ negates the mask, keeping the rows where CO2 is not missing
missing.describe()
notmissing.describe()
sorted_df = df.sort_values(by = 'Temperature')
msno.matrix(sorted_df)
plt.show()
# from the above output it is evident that CO2 values are missing for cold-temperature records - possibly a sensor failure in cold temperatures
# missing completely at random (MCAR): no relationship between the missingness and any values
# missing at random (MAR): data missing only for certain pockets of the cohorts, ie the missingness depends on other observed values, so we know why it is missing
# missing not at random (MNAR): the missingness depends on unobserved values, eg: temperature not recorded above 50 degrees because it is too hot for the thermometer, but this is difficult to confirm from the data alone
#drop missing values
df_droppedmissingval = df.dropna(subset = ['CO2'])
df_droppedmissingval.describe()
#replacing with statistical measures
co2_mean = df['CO2'].mean()
df_mean_imputed = df.fillna({'CO2' : co2_mean})
df_mean_imputed.describe()
#you can also use domain knowledge
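# Sketch of a domain-knowledge style imputation (assumes Temperature drives the missing CO2 values)
temp_bins = pd.cut(df['Temperature'], bins = 5)
df['CO2'] = df['CO2'].fillna(df.groupby(temp_bins)['CO2'].transform('mean'))  # fill with the mean of similar-temperature rows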
#Compare Strings
#Record Linkage
#Minimum edit distance measures how different two strings are, eg: 'intention' vs 'execution'
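# Sketch: similarity scoring with thefuzz (fuzzywuzzy exposes the same API); the process.extract calls below assume this import
from thefuzz import fuzz, process
print(fuzz.WRatio('intention', 'execution'))  # score from 0 (no match) to 100 (exact match)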
# Store the unique values of cuisine_type in unique_types
unique_types = restaurants['cuisine_type'].unique()
# Calculate similarity of 'asian' to all values of unique_types
print(process.extract('asian', unique_types, limit = len(unique_types)))
# Calculate similarity of 'american' to all values of unique_types
print(process.extract('american', unique_types, limit = len(unique_types)))
# Calculate similarity of 'italian' to all values of unique_types
print(process.extract('italian', unique_types, limit = len(unique_types)))
# From the above we know, for every unique cuisine_type value, how similar it is to each of the correct cuisine names
#we now collapse the misspellings of 'italian'
# Create a list of matches, comparing 'italian' with the cuisine_type column
matches = process.extract('italian', restaurants['cuisine_type'], limit=len(restaurants.cuisine_type))
# Iterate through the list of matches to italian
for match in matches:
    # Check whether the similarity score is greater than or equal to 80
    if match[1] >= 80:
        # Select all rows where the cuisine_type is spelled this way, and set them to the correct cuisine
        restaurants.loc[restaurants['cuisine_type'] == match[0], 'cuisine_type'] = 'italian'
# but we need to do this for all the cuisines
categories = ['italian', 'asian', 'american']
# Iterate through categories
for cuisine in categories:
    # Create a list of matches, comparing cuisine with the cuisine_type column
    matches = process.extract(cuisine, restaurants['cuisine_type'], limit=len(restaurants.cuisine_type))
    # Iterate through the list of matches
    for match in matches:
        # Check whether the similarity score is greater than or equal to 80
        if match[1] >= 80:
            # If it is, select all rows where the cuisine_type is spelled this way, and set them to the correct cuisine
            restaurants.loc[restaurants['cuisine_type'] == match[0], 'cuisine_type'] = cuisine
# Inspect the final result
print(restaurants['cuisine_type'].unique())
#Generating Pairs
# records in two tables often don't match exactly because the same entity is named or spelled differently, hence you need record linkage
#recordlinkage package
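# Sketch of generating and comparing pairs with recordlinkage (the census DataFrames and column names are assumptions)
import recordlinkage
indexer = recordlinkage.Index()
indexer.block('state')                       # only pair rows that share the same state
pairs = indexer.index(census_A, census_B)    # MultiIndex of candidate row pairs
# compare the candidate pairs column by column
compare_cl = recordlinkage.Compare()
compare_cl.exact('date_of_birth', 'date_of_birth', label = 'date_of_birth')
compare_cl.string('surname', 'surname', threshold = 0.85, label = 'surname')
potential_matches = compare_cl.compute(pairs, census_A, census_B)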