
NAME : Sanket Sarode

DIV : B BATCH : B2

ROLL NO : 130

ID : 21102045

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
data = pd.read_csv('target.csv')

# Descriptive statistics
print(data.describe())

# Check for null values
print(data.isnull().sum())

# Drop rows with null values (if any)
data = data.dropna()

# Convert categorical columns to numerical
data['salary'] = data['salary'].map({'low': 0, 'medium': 1, 'high': 2})
data['Department'] = data['Department'].astype('category').cat.codes

       satisfaction_level  last_evaluation  number_project  \
count        14999.000000     14999.000000    14999.000000
mean             0.612834         0.716102        3.803054
std              0.248631         0.171169        1.232592
min              0.090000         0.360000        2.000000
25%              0.440000         0.560000        3.000000
50%              0.640000         0.720000        4.000000
75%              0.820000         0.870000        5.000000
max              1.000000         1.000000        7.000000

       average_montly_hours  time_spend_company  Work_accident          left  \
count          14999.000000        14999.000000   14999.000000  14999.000000
mean             201.050337            3.498233       0.144610      0.238083
std               49.943099            1.460136       0.351719      0.425924
min               96.000000            2.000000       0.000000      0.000000
25%              156.000000            3.000000       0.000000      0.000000
50%              200.000000            3.000000       0.000000      0.000000
75%              245.000000            4.000000       0.000000      0.000000
max              310.000000           10.000000       1.000000      1.000000

       promotion_last_5years
count           14999.000000
mean                0.021268
std                 0.144281
min                 0.000000
25%                 0.000000
50%                 0.000000
75%                 0.000000
max                 1.000000

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

# Boxplot to detect outliers
sns.boxplot(data=data[['satisfaction_level', 'last_evaluation', 'number_project',
                       'average_montly_hours', 'time_spend_company']])
plt.show()

# Optional: Remove outliers based on some criteria (e.g., z-score or IQR method)
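
A minimal sketch of the optional IQR-based filtering mentioned above, assuming the same numeric columns plotted in the boxplot; the 1.5 * IQR fence is the conventional cutoff, not a value fixed by the assignment, and the names numeric_cols, within_fences, and data_no_outliers are illustrative.

# Optional outlier removal using the IQR rule (illustrative sketch)
numeric_cols = ['satisfaction_level', 'last_evaluation', 'number_project',
                'average_montly_hours', 'time_spend_company']
Q1 = data[numeric_cols].quantile(0.25)
Q3 = data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
# Keep rows where every numeric column lies within the 1.5 * IQR fences
within_fences = ~((data[numeric_cols] < Q1 - 1.5 * IQR) |
                  (data[numeric_cols] > Q3 + 1.5 * IQR)).any(axis=1)
data_no_outliers = data[within_fences]  # kept separate; later steps still use the full dataset
print(f"Rows remaining after IQR filtering: {len(data_no_outliers)}")
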
# Correlation matrix
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Bar chart: Salary vs Retention
sns.barplot(x='salary', y='left', data=data)
plt.title("Impact of Salary on Employee Retention")
plt.xlabel("Salary Level")
plt.ylabel("Employee Left (1 = Left, 0 = Stayed)")
plt.show()

# Bar chart: Department vs Retention
sns.barplot(x='Department', y='left', data=data)
plt.title("Correlation Between Department and Employee Retention")
plt.xlabel("Department")
plt.ylabel("Employee Left (1 = Left, 0 = Stayed)")
plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load and prepare your dataset (assuming it's preprocessed)
# data = pd.read_csv('employee_data.csv')  # If not already loaded

# Define features and target variable
X = data.drop('left', axis=1)  # Features (excluding the target column 'left')
y = data['left']  # Target (employee retention)

# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plotting confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plotting ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='orange', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Sensitivity (same as recall) and Specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)  # Sensitivity = TP / (TP + FN)
specificity = tn / (tn + fp)  # Specificity = TN / (TN + FP)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall (Sensitivity): {recall:.2f}")
print(f"Specificity: {specificity:.2f}")

Accuracy: 0.78
Precision: 0.59
Recall (Sensitivity): 0.33
Specificity: 0.93
