1. Write a Python program for Data pre-processing and include Data cleaning steps
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore
# Sample dataset creation
data = {
'Name': ['Alice', 'Bob', 'Charlie', None, 'Eve', 'Frank', 'Alice', 'Grace'],
'Age': [25, np.nan, 30, 35, 29, 120, 25, 33],
'Gender': ['Female', 'Male', 'Male', 'Female', None, 'Male', 'Female', 'Female'],
'Income': [50000, 60000, np.nan, 80000, 75000, 300000, 50000, 72000],
'Location': ['New York', 'San Francisco', 'Chicago', None, 'New York', 'Chicago', 'New York', 'Chicago']
}
# Load data into a DataFrame
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
# Step 1: Handling Missing Values
# Fill missing numerical values with the column median
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df['Income'].median())
# Fill missing categorical values with the mode or a placeholder value
df['Name'] = df['Name'].fillna('Unknown')
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Location'] = df['Location'].fillna('Unknown')
# Step 2: Removing Duplicates
# Check for duplicates and drop them
df = df.drop_duplicates()
# Step 3: Handling Outliers
# Remove rows where numerical features have absolute Z-scores > 3 (extreme outliers); see the alternative sketch below
z_scores = zscore(df[['Age', 'Income']])
df = df[(np.abs(z_scores) < 3).all(axis=1)]
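# Alternative sketch (not part of the required steps): on a sample this small, |Z| rarely
# exceeds 3, so an IQR rule can be a more effective way to catch the extreme Age/Income rows:
# Q1, Q3 = df['Income'].quantile(0.25), df['Income'].quantile(0.75)
# IQR = Q3 - Q1
# df = df[(df['Income'] >= Q1 - 1.5 * IQR) & (df['Income'] <= Q3 + 1.5 * IQR)]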
# Step 4: Encoding Categorical Variables
# Convert categorical columns to numerical using Label Encoding
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender']) # Female = 0, Male = 1
df['Location'] = label_encoder.fit_transform(df['Location']) # Encoding Location
# Step 5: Removing Irrelevant Features
# Drop the 'Name' column as it's non-informative for analysis
df = df.drop(columns=['Name'])
# Step 6: Scaling Features
# Normalize numerical features using StandardScaler
scaler = StandardScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])
# Final Cleaned DataFrame
print("\nCleaned DataFrame:")
print(df)
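A quick verification step (a minimal sketch, reusing the df produced above) is to confirm that no missing values or duplicate rows remain after cleaning:
# Verify the cleaning: both counts should be zero
print("\nRemaining missing values per column:")
print(df.isnull().sum())
print("Remaining duplicate rows:", df.duplicated().sum())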
2. Write a Python program and include Data Integration steps and Data cleaning steps
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Sample Datasets
data1 = {
'CustomerID': [1, 2, 3, 4],
'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
'Age': [25, np.nan, 30, 35],
'Gender': ['Female', 'Male', 'Male', 'Female']
}
data2 = {
'CustomerID': [3, 4, 5, 6],
'Income': [60000, 80000, np.nan, 75000],
'Location': ['New York', 'Chicago', None, 'San Francisco']
}
# Step 1: Load Data
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print("Dataset 1:")
print(df1)
print("\nDataset 2:")
print(df2)
# Step 2: Data Integration (Merging Datasets)
# Merge datasets on CustomerID
df = pd.merge(df1, df2, on='CustomerID', how='outer')
print("\nIntegrated Dataset:")
print(df)
df10 = pd.merge(df1, df2, on='CustomerID', how='inner')
print("\nIntegrated Dataset:")
print(df10)
# Step 3: Handling Missing Values
# Fill missing numerical values with the median
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Income'] = df['Income'].fillna(df['Income'].median())
# Fill missing categorical values with the mode
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Location'] = df['Location'].fillna('Unknown')
# Step 4: Removing Duplicates
df = df.drop_duplicates()
# Step 5: Encoding Categorical Variables
# Encode Gender and Location columns
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Location'] = label_encoder.fit_transform(df['Location'])
# Step 6: Scaling Numerical Features
scaler = StandardScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])
# Final Processed Dataset
print("\nProcessed Dataset:")
print(df)
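When integrating tables, it often helps to record which source each row came from; the sketch below repeats the outer merge with pandas' indicator=True option so the integration can be audited:
# Optional audit: the _merge column shows whether a row came from df1, df2, or both
df_audit = pd.merge(df1, df2, on='CustomerID', how='outer', indicator=True)
print("\nMerge audit:")
print(df_audit[['CustomerID', '_merge']])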
3. Write a Python Program and include data reduction functions
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
# Sample Dataset
data = {
'Feature1': [2.5, 3.0, 2.8, 3.2, 2.7, 3.5, 2.9],
'Feature2': [1.2, 1.5, 1.3, 1.7, 1.4, 1.8, 1.6],
'Feature3': [0.8, 0.9, 1.0, 0.7, 1.1, 0.6, 1.2],
'Feature4': [10, 20, 15, 10, 20, 15, 10],
'Target': [0, 1, 1, 0, 1, 0, 1]
}
# Load data into a DataFrame
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
# Splitting features and target
X = df.drop(columns=['Target'])
y = df['Target']
# Step 1: Feature Scaling
# Scale features to normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 2: Dimensionality Reduction using PCA
# Reduce dimensions to 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print("\nData after PCA (Dimensionality Reduction):")
print(pd.DataFrame(X_pca, columns=['PC1', 'PC2']))
# Step 3: Feature Selection
# Select top 2 features based on statistical significance
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)
print("\nData after Feature Selection (Top 2 Features):")
print(pd.DataFrame(X_selected, columns=['Feature1', 'Feature2']))
# Step 4: Sampling (Data Size Reduction)
# Split data into a smaller sample for testing
X_train, X_sample, y_train, y_sample = train_test_split(X, y, test_size=0.4,
random_state=42)
print("\nSampled Data (40% of Original Dataset):")
print(pd.DataFrame(X_sample, columns=X.columns))
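To judge how much information the reduction preserves, a short sketch (reusing the pca, selector, and X objects fitted above) prints the variance retained by the principal components and the F-scores behind the feature selection:
# Variance retained by each principal component
print("\nExplained variance ratio (PC1, PC2):", pca.explained_variance_ratio_)
# ANOVA F-scores used by SelectKBest (higher = more relevant to Target)
print("Feature scores:", dict(zip(X.columns, selector.scores_)))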
4. Write a Python Program and include data transformation functions
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from scipy.stats import boxcox
# Sample Dataset
data = {
'Feature1': [10, 20, 30, 40, 50],
'Feature2': [1.2, 3.5, 5.1, 7.3, 9.0],
'Feature3': [1000, 1500, 2000, 2500, 3000],
'Category': ['A', 'B', 'A', 'C', 'B']
}
# Load data into a DataFrame
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
# Step 1: Normalization
# Normalize Feature2 to a range of [0, 1]
minmax_scaler = MinMaxScaler()
df['Feature2_Normalized'] = minmax_scaler.fit_transform(df[['Feature2']])
# Step 2: Standardization
# Standardize Feature1 to have a mean of 0 and standard deviation of 1
std_scaler = StandardScaler()
df['Feature1_Standardized'] = std_scaler.fit_transform(df[['Feature1']])
# Step 3: Log Transformation
# Apply log transformation to Feature3 to reduce skewness
df['Feature3_Log'] = np.log(df['Feature3'])
# Step 4: Power Transformation (Box-Cox Transformation)
# Apply Box-Cox transformation to Feature2 (requires positive values)
df['Feature2_BoxCox'], _ = boxcox(df['Feature2'])
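# Note (sketch): boxcox also returns the fitted lambda, discarded above with '_'.
# Keeping it lets you invert the transform later, e.g.:
# df['Feature2_BoxCox'], bc_lambda = boxcox(df['Feature2'])
# from scipy.special import inv_boxcox
# original = inv_boxcox(df['Feature2_BoxCox'], bc_lambda)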
# Step 5: Encoding Categorical Variables
# Label Encoding for Category
label_encoder = LabelEncoder()
df['Category_LabelEncoded'] = label_encoder.fit_transform(df['Category'])
# One-Hot Encoding for Category (sparse_output=False for scikit-learn >= 1.2; older versions use sparse=False)
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(df[['Category']])
onehot_encoded_df = pd.DataFrame(onehot_encoded,
columns=onehot_encoder.get_feature_names_out(['Category']))
df = pd.concat([df, onehot_encoded_df], axis=1)
# Display Transformed Dataset
print("\nTransformed Dataset:")
print(df)
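A simple sanity check (a sketch, assuming the df built above) is that the normalized column lies in [0, 1] and the standardized column has roughly zero mean and unit variance:
# Sanity-check the scaled columns
print("\nFeature2_Normalized range:", df['Feature2_Normalized'].min(), "-", df['Feature2_Normalized'].max())
print("Feature1_Standardized mean/std:",
      round(df['Feature1_Standardized'].mean(), 3),
      round(df['Feature1_Standardized'].std(ddof=0), 3))  # ddof=0 matches StandardScaler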
5. Write a Python program for Feature Engineering concepts (include Titanic dataset)
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Load Titanic dataset (replace 'train.csv' with the path to your dataset)
data = pd.read_csv('train.csv')
# Display the first few rows of the dataset
print("Initial Data Sample:")
print(data.head())
# Function for feature engineering
def feature_engineering(data):
    # Make a copy of the dataset
    df = data.copy()
    # Handle missing values
    imputer_age = SimpleImputer(strategy='median')
    df['Age'] = imputer_age.fit_transform(df[['Age']])
    imputer_embarked = SimpleImputer(strategy='most_frequent')
    df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']])
    # Drop columns that are not useful
    df.drop(['Cabin', 'Ticket', 'Name'], axis=1, inplace=True)
    # Encode categorical variables
    label_encoder = LabelEncoder()
    df['Sex'] = label_encoder.fit_transform(df['Sex'])
    # sparse_output=False for scikit-learn >= 1.2 (older versions use sparse=False)
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
    embarked_encoded = one_hot_encoder.fit_transform(df[['Embarked']])
    embarked_encoded_df = pd.DataFrame(embarked_encoded,
                                       columns=one_hot_encoder.get_feature_names_out(['Embarked']))
    df = pd.concat([df, embarked_encoded_df], axis=1)
    df.drop(['Embarked'], axis=1, inplace=True)
    # Create new features
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['IsAlone'] = (df['FamilySize'] == 0).astype(int)
    # Scale numerical features
    scaler = StandardScaler()
    numerical_features = ['Age', 'Fare']
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df
# Perform feature engineering on the Titanic dataset
processed_data = feature_engineering(data)
# Display the processed data
print("\nProcessed Data Sample:")
print(processed_data.head())
# Save the processed data to a new CSV file
processed_data.to_csv('processed_titanic_data.csv', index=False)
print("\nProcessed data saved to 'processed_titanic_data.csv'.")