Ashutosh Anand
DL Assignment Lab 3 Employee
202318035
DATASET 3: EMPLOYEE
In [ ]: import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset
file_path = '/content/Employee.csv'
employee_data = pd.read_csv(file_path)
# Display the first few rows of the dataset
employee_data.head()
from sklearn.metrics import accuracy_score, classification_report
# Data preprocessing
employee_data['Gender'] = employee_data['Gender'].map({'Male': 1, 'Female': 0})
employee_data['EverBenched'] = employee_data['EverBenched'].map({'Yes': 1, 'No': 0})
employee_data = employee_data.drop(columns=['Education', 'City', 'JoiningYear'])
# Display the cleaned dataset
employee_data.head()
# Gender distribution plot
sns.countplot(x='Gender', data=employee_data)
plt.title('Gender Distribution')
plt.show()
# Correlation matrix heatmap
sns.heatmap(employee_data.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()
# Age distribution by gender
sns.histplot(data=employee_data, x='Age', hue='Gender', multiple='stack', kde=True)
plt.title('Age Distribution by Gender')
plt.show()
# Experience in current domain by gender
sns.boxplot(data=employee_data, x='Gender', y='ExperienceInCurrentDomain')
plt.title('Experience in Current Domain by Gender')
plt.show()
# Payment tier distribution by gender
sns.countplot(x='PaymentTier', data=employee_data, hue='Gender')
plt.title('Payment Tier Distribution by Gender')
plt.show()
In [ ]: from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Feature scaling
scaler = StandardScaler()
X = employee_data.drop(columns=['Gender'])
X_scaled = scaler.fit_transform(X)
# Define features and target
y = employee_data['Gender']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=4
# Initialize and train the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
# Predict on the test set
predictions = logistic_model.predict(X_test)
# Evaluate the model
model_accuracy = accuracy_score(y_test, predictions)
classification_report_result = classification_report(y_test, predictions)
print(f'Accuracy: {model_accuracy:.4f}')
print('Classification Report:')
print(classification_report_result)
Accuracy: 0.6584317937701396
Classification Report:
precision recall f1-score support
0 0.72 0.29 0.42 388
1 0.65 0.92 0.76 543
accuracy 0.66 931
macro avg 0.68 0.61 0.59 931
weighted avg 0.68 0.66 0.62 931
In [ ]: import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
# Convert features and target to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1) # Reshape for
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# Define the neural network model
class ANNModel(nn.Module):
def __init__(self):
super(ANNModel, self).__init__()
self.fc1 = nn.Linear(X_train_tensor.shape[1], 64)
self.fc2 = nn.Linear(64, 64)
self.fc_out = nn.Linear(64, 1)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = torch.sigmoid(self.fc_out(x)) # Output layer for binary classification
return x
# Initialize the model, loss function, and optimizer
model = ANNModel()
criterion = nn.BCELoss() # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training loop
num_epochs = 50
for epoch in range(num_epochs):
model.train()
total_loss = 0.0
for batch_inputs, batch_targets in train_loader:
optimizer.zero_grad()
predictions = model(batch_inputs)
loss = criterion(predictions, batch_targets)
loss.backward()
optimizer.step()
total_loss += loss.item()
# Optionally print training progress
print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}')
# Evaluation
model.eval()
predictions_list = []
with torch.no_grad():
for batch_inputs, _ in test_loader:
batch_predictions = model(batch_inputs)
predictions_list.append(batch_predictions)
# Concatenate all predictions and convert to numpy array
all_predictions = torch.cat(predictions_list).numpy()
all_predictions = (all_predictions > 0.5).astype(int) # Convert probabilities to binary class
# Evaluate the model
accuracy = accuracy_score(y_test, all_predictions)
report = classification_report(y_test, all_predictions)
print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report:\n{report}')
Accuracy: 0.6702470461868958
Classification Report:
precision recall f1-score support
0 0.72 0.34 0.46 388
1 0.66 0.90 0.76 543
accuracy 0.67 931
macro avg 0.69 0.62 0.61 931
weighted avg 0.68 0.67 0.64 931