Sanket ML Assign1

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 9

NAME : Sanket Sarode

DIV : B BATCH : B2

ROLL NO : 130

ID : 21102045

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

# Load dataset
data = pd.read_csv('target.csv')

# Descriptive statistics
print(data.describe())

# Check for null values
print(data.isnull().sum())

# Drop rows with null values (if any)
data = data.dropna()

# Convert categorical columns to numerical.
# 'salary' is mapped explicitly so the integer order follows low < medium < high.
data['salary'] = data['salary'].map({'low': 0, 'medium': 1, 'high': 2})
# 'Department' is replaced by its pandas category codes.
# NOTE(review): these codes are arbitrary labels with no ordinal meaning, but
# any later numeric model will treat them as ordered -- confirm this is intended.
data['Department'] = data['Department'].astype('category').cat.codes

       satisfaction_level  last_evaluation  number_project  \
count        14999.000000     14999.000000    14999.000000
mean             0.612834         0.716102        3.803054
std              0.248631         0.171169        1.232592
min              0.090000         0.360000        2.000000
25%              0.440000         0.560000        3.000000
50%              0.640000         0.720000        4.000000
75%              0.820000         0.870000        5.000000
max              1.000000         1.000000        7.000000
       average_montly_hours  time_spend_company  Work_accident          left  \
count          14999.000000        14999.000000   14999.000000  14999.000000
mean             201.050337            3.498233       0.144610      0.238083
std               49.943099            1.460136       0.351719      0.425924
min               96.000000            2.000000       0.000000      0.000000
25%              156.000000            3.000000       0.000000      0.000000
50%              200.000000            3.000000       0.000000      0.000000
75%              245.000000            4.000000       0.000000      0.000000
max              310.000000           10.000000       1.000000      1.000000

       promotion_last_5years
count           14999.000000
mean                0.021268
std                 0.144281
min                 0.000000
25%                 0.000000
50%                 0.000000
75%                 0.000000
max                 1.000000
satisfaction_level 0
last_evaluation 0
number_project 0
average_montly_hours 0
time_spend_company 0
Work_accident 0
left 0
promotion_last_5years 0
Department 0
salary 0
dtype: int64

# Boxplot to detect outliers in the continuous / count-valued columns
sns.boxplot(data=data[['satisfaction_level',
                       'last_evaluation',
                       'number_project',
                       'average_montly_hours',
                       'time_spend_company']])
plt.show()

# Optional: Remove outliers based on some criteria (e.g., z-score, IQR method)

# Correlation matrix
# NOTE: assumes every column of `data` is numeric at this point (i.e. the
# categorical columns were already encoded) -- TODO confirm for new datasets.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Bar chart: Salary vs Retention
sns.barplot(x='salary', y='left', data=data)
plt.title("Impact of Salary on Employee Retention")
plt.xlabel("Salary Level")
plt.ylabel("Employee Left (1 = Left, 0 = Stayed)")
plt.show()

# Bar chart: Department vs Retention
sns.barplot(x='Department', y='left', data=data)
plt.title("Correlation Between Department and Employee Retention")
plt.xlabel("Department")
plt.ylabel("Employee Left (1 = Left, 0 = Stayed)")
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
)
import matplotlib.pyplot as plt
import seaborn as sns

# Load and prepare your dataset (assuming it's preprocessed)
# data = pd.read_csv('employee_data.csv')  # If not already loaded

# Define features and target variable
X = data.drop('left', axis=1)  # Features (excluding the target column 'left')
y = data['left']  # Target (employee retention)

# Split data into training and testing sets (70% train, 30% test).
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)
# Column 1 of predict_proba is the probability of the positive class (left == 1),
# assuming the labels are 0/1 as the summary statistics suggest.
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Plotting confusion matrix (fmt='d' renders the cell counts as integers)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plotting ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='orange', label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Sensitivity (same as recall) and Specificity.
# The unpacking order tn, fp, fn, tp assumes a binary 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)  # Sensitivity = TP / (TP + FN)
specificity = tn / (tn + fp)  # Specificity = TN / (TN + FP)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall (Sensitivity): {recall:.2f}")
print(f"Specificity: {specificity:.2f}")

Accuracy: 0.78
Precision: 0.59
Recall (Sensitivity): 0.33
Specificity: 0.93

You might also like