import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import os
# --- Global plotting configuration and output directories ---

# Set a global seaborn style; alternatives include 'darkgrid', 'ticks', etc.
sns.set(style="whitegrid", palette="coolwarm")

# Update font style globally (size and family apply to all matplotlib output).
plt.rcParams.update({'font.size': 12, 'font.family': 'serif'})

# Directory where generated plots are saved.
visualizations_directory = 'visualizations/'
os.makedirs(visualizations_directory, exist_ok=True)  # no-op if it already exists

# Directory where the cleaned dataset is written.
cleaned_data_directory = 'cleaned_data_files/'
os.makedirs(cleaned_data_directory, exist_ok=True)
def column_wise_analysis(data):
    """
    Analyzes columns for data types, unique values, and numerical summaries.

    Prints the dtype table once, then for every column: its unique-value
    count, a ``describe()`` summary for numeric columns, or the top-5
    value counts for everything else.
    """
    print("Data Types:\n", data.dtypes)
    numeric_dtypes = ('float64', 'int64')
    for col in data.columns:
        series = data[col]
        print(f"\nColumn: {col}")
        print(f"Unique Values: {series.nunique()}")
        if series.dtype in numeric_dtypes:
            print(f"Summary:\n{series.describe()}\n")
        else:
            print(f"Value Counts:\n{series.value_counts().head(5)}\n")
def data_cleaning(data):
    """
    Cleans the dataset by handling missing values and inconsistencies.

    Steps (all performed in place on ``data``):
      * numeric NaNs  -> filled with the column mean
      * object NaNs   -> filled with the column mode (most frequent value)
      * object values -> lowercased for uniformity
      * numeric outliers clipped to mean +/- 3 standard deviations

    Parameters
    ----------
    data : pd.DataFrame
        Dataset to clean; mutated in place.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, cleaned.
    """
    # Fill numerical NaNs with the column mean.
    for column in data.select_dtypes(include=['float64', 'int64']).columns:
        if data[column].isnull().any():
            data[column] = data[column].fillna(data[column].mean())

    # Fill categorical NaNs with the mode. Guard against an all-NaN column,
    # where mode() is empty and mode()[0] would raise IndexError.
    for column in data.select_dtypes(include=['object']).columns:
        if data[column].isnull().any():
            mode = data[column].mode()
            if not mode.empty:
                data[column] = data[column].fillna(mode[0])

    # Ensure uniformity in categorical columns (convert to lowercase).
    for column in data.select_dtypes(include=['object']).columns:
        data[column] = data[column].str.lower()

    # Handle outliers: clip to within mean +/- 3*std
    # (optional; adjust the thresholds or remove outliers if needed).
    for column in data.select_dtypes(include=['float64', 'int64']).columns:
        mean, std = data[column].mean(), data[column].std()
        data[column] = data[column].clip(lower=mean - 3 * std,
                                         upper=mean + 3 * std)
    return data
def identify_critical_columns(data):
    """
    Identifies and visualizes critical columns in the dataset.

    Prints the reasoning for each cost/usage-related column present in the
    data, then saves and shows a bar plot of each one.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset to inspect; not modified.
    """
    print("Columns in the dataset:", data.columns)  # Debugging: print the columns

    # Update the critical columns list based on your dataset.
    candidate_columns = ['TOTALCOST', 'REPORTING_COST', 'LBRCOST', 'KM']
    critical_columns = [col for col in candidate_columns if col in data.columns]
    if not critical_columns:
        print("No critical columns found in the dataset.")
        return

    # Reasoning for why each column is considered critical.
    reasoning = {
        'TOTALCOST': "Total expenditure; critical for profitability analysis.",
        'REPORTING_COST': "Relevant for tracking costs associated with repairs.",
        'LBRCOST': "Labor costs involved in repairs.",
        'KM': "Kilometers, relevant for repair frequency analysis."
    }
    print("\nReasoning for Selected Critical Columns:")
    for col in critical_columns:
        print(f"{col.upper()}: {reasoning.get(col, 'No reasoning available for this column')}")

        # Visualize the critical column (bar plot, one bar per row).
        # NOTE(review): for large datasets this is a very dense plot;
        # consider plotting an aggregate instead — confirm intent.
        plt.figure(figsize=(10, 6))
        data[col].plot(kind='bar', color='skyblue')
        plt.title(f'{col.upper()} Bar Plot')
        plt.xlabel('Index')
        plt.ylabel(col.upper())
        plt.tight_layout()
        # Save plot as an image file in the visualizations directory.
        plt.savefig(f'{visualizations_directory}{col}_barplot.png')
        plt.show()
def generate_tags(data, text_column='CAUSAL_PART_NM'):
    """
    Generates tags/features from free text in the dataset (if applicable).

    Fits a CountVectorizer (English stop words, top 10 terms) on the given
    text column and prints the learned vocabulary. NaNs in the text column
    are replaced with empty strings, which mutates ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset containing the free-text column.
    text_column : str, optional
        Name of the column to vectorize; skipped with a message if absent.
    """
    if text_column not in data.columns:
        print(f"\nColumn '{text_column}' not found. Skipping tag generation.")
        return

    vectorizer = CountVectorizer(stop_words='english', max_features=10)
    # Replace NaNs with empty strings for text processing.
    data[text_column] = data[text_column].fillna('')
    vectorizer.fit_transform(data[text_column])  # Fit to learn the tag vocabulary.
    print("\nGenerated Tags:", vectorizer.get_feature_names_out())  # Display top tags.
def visualize_data(data):
    """
    Create various visualizations for the dataset.

    Saves each figure under ``visualizations_directory`` and shows it:
      * bar charts for selected categorical columns
      * monthly line chart of repairs over time (REPAIR_DATE)
      * scatter plot of KM vs TOTALCOST
      * histogram of REPAIR_AGE
      * correlation heatmap of the numerical columns
      * pie chart of TRANSACTION_CATEGORY

    Parameters
    ----------
    data : pd.DataFrame
        Cleaned dataset. REPAIR_DATE (if present) is converted to
        datetime in place.
    """
    # Bar charts for categorical data.
    categorical_columns = ['CAUSAL_PART_NM', 'PLATFORM', 'BODY_STYLE',
                           'DEALER_NAME', 'STATE']
    for col in categorical_columns:
        if col in data.columns:
            plt.figure(figsize=(10, 6))
            data[col].value_counts().plot(kind='bar', color='skyblue')
            plt.title(f'{col} Distribution')
            plt.xlabel(col)
            plt.ylabel('Frequency')
            plt.tight_layout()
            # Save with high resolution.
            plt.savefig(f'{visualizations_directory}{col}_barchart.png', dpi=300)
            plt.show()

    # Line chart for time-series data (REPAIR_DATE), aggregated by month.
    if 'REPAIR_DATE' in data.columns:
        plt.figure(figsize=(10, 6))
        # Convert to datetime; unparseable values become NaT instead of raising.
        data['REPAIR_DATE'] = pd.to_datetime(data['REPAIR_DATE'], errors='coerce')
        data.groupby(data['REPAIR_DATE'].dt.to_period('M')).size().plot(
            kind='line', marker='o', color='orange')
        plt.title('Repairs Over Time')
        plt.xlabel('Date')
        plt.ylabel('Number of Repairs')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}repair_date_linechart.png', dpi=300)
        plt.show()

    # Scatter plot for KM vs TOTALCOST.
    if 'KM' in data.columns and 'TOTALCOST' in data.columns:
        plt.figure(figsize=(10, 6))
        plt.scatter(data['KM'], data['TOTALCOST'], color='green')
        plt.title('Scatter Plot: KM vs TOTALCOST')
        plt.xlabel('Kilometers')
        plt.ylabel('Total Cost')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}km_vs_totalcost_scatterplot.png', dpi=300)
        plt.show()

    # Histogram for the distribution of REPAIR_AGE.
    if 'REPAIR_AGE' in data.columns:
        plt.figure(figsize=(10, 6))
        data['REPAIR_AGE'].plot(kind='hist', bins=30, edgecolor='black',
                                color='purple')
        plt.title('Distribution of REPAIR_AGE')
        plt.xlabel('Repair Age')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}repair_age_histogram.png', dpi=300)
        plt.show()

    # Heatmap for correlation between numerical columns (needs at least two).
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_columns) > 1:
        correlation_matrix = data[numerical_columns].corr()
        plt.figure(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f',
                    cbar=True)
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}correlation_heatmap.png', dpi=300)
        plt.show()

    # Pie chart for TRANSACTION_CATEGORY distribution.
    if 'TRANSACTION_CATEGORY' in data.columns:
        plt.figure(figsize=(8, 8))
        data['TRANSACTION_CATEGORY'].value_counts().plot(
            kind='pie', autopct='%1.1f%%', startangle=90)
        plt.title('Transaction Category Distribution')
        plt.ylabel('')  # Suppress the default series-name label.
        plt.tight_layout()
        plt.savefig(f'{visualizations_directory}transaction_category_piechart.png',
                    dpi=300)
        plt.show()
def main():
    """
    Main function to execute data analysis, cleaning, and visualization steps.

    Loads the Excel dataset from a hard-coded path, then runs the
    column analysis, cleaning, critical-column inspection, tag generation,
    and visualization steps, and finally writes the cleaned data to CSV.
    Any exception is caught and reported rather than propagated.
    """
    try:
        # Load the dataset from the Excel file.
        file_path = r'D:\Downloads(D)\Chrome\Data for Task 1. (1).xlsx'
        data = pd.read_excel(file_path)
        print("Dataset loaded successfully.")

        # Quick look at the raw data before any processing.
        print("\nInitial Dataset Overview:")
        print(data.head())
        print(data.info())

        # Step 1: column-wise analysis.
        print("\nPerforming column-wise analysis...")
        column_wise_analysis(data)

        # Step 2: clean the data.
        print("\nCleaning data...")
        cleaned_data = data_cleaning(data)
        print("\nData after cleaning:")
        print(cleaned_data.head())

        # Step 3: critical columns.
        print("\nIdentifying and analyzing critical columns...")
        identify_critical_columns(cleaned_data)

        # Step 4: tags from free text.
        print("\nGenerating tags from free text (if applicable)...")
        generate_tags(cleaned_data, text_column='CAUSAL_PART_NM')

        # Step 5: visualizations.
        print("\nVisualizing the data...")
        visualize_data(cleaned_data)

        # Step 6: persist the cleaned data as CSV.
        output_path = os.path.join(cleaned_data_directory, 'cleaned_data.csv')
        cleaned_data.to_csv(output_path, index=False)
        print(f"\nCleaned data saved to {output_path}")
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print(f"An error occurred: {e}")


if __name__ == '__main__':
    main()