import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Step 1: Load the dataset (here a sample "people" CSV hosted on GitHub)
# pd.read_csv() accepts either a local file path or a URL.
# The URL is written as two adjacent string literals inside parentheses so the
# long address can span lines while remaining a single valid string.
# Replace with your dataset URL
url = (
    "https://raw.githubusercontent.com/datablist/sample-csvfiles/main/files/"
    "people/people-100.csv"
)
df = pd.read_csv(url)
# Fail fast with a clear message when the dataset lacks the columns that the
# later steps of this script index directly ('name', 'age', 'salary').
# NOTE(review): confirm the sample URL above actually provides these columns.
required_columns = {"name", "age", "salary"}
missing_columns = required_columns.difference(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )
# Step 2: Data Cleaning - Handling missing values
# Report the number of missing values per column before cleaning
print("Missing values before cleaning:\n", df.isnull().sum())
# Fill missing values in numerical columns with that column's mean.
# numeric_only=True is required because the frame also holds text columns
# (e.g. 'name'): with the pandas >= 2.0 default, df.mean() raises TypeError
# when it meets non-numeric data. Text columns are left untouched, so they may
# still report missing values in the summary printed below.
df.fillna(df.mean(numeric_only=True), inplace=True)
print("Missing values after cleaning:\n", df.isnull().sum())
# Step 3: String Manipulation - Clean text columns
# Lowercase the 'name' column and trim leading/trailing whitespace
# (str.strip() does not collapse spaces inside the value)
df['name'] = df['name'].str.lower().str.strip()
# Step 4: Use NumPy - Convert numerical columns to NumPy arrays
# Convert the 'age' and 'salary' columns into NumPy arrays and calculate
# basic statistics on them.
age_array = df['age'].to_numpy()
salary_array = df['salary'].to_numpy()
# Calculate basic statistics: mean and median
age_mean = np.mean(age_array)
age_median = np.median(age_array)
salary_mean = np.mean(salary_array)
salary_median = np.median(salary_array)
print(f"Age - Mean: {age_mean}, Median: {age_median}")
print(f"Salary - Mean: {salary_mean}, Median: {salary_median}")
# Step 5: Data Splitting - hold out part of the data for evaluation
# Salary is the value being predicted; age is the only predictor.
X = df[['age']]  # double brackets keep the feature as a 2-D DataFrame
y = df['salary']  # target column as a 1-D Series
# 80% of the rows train the model, 20% are held out for testing; the fixed
# random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 6: Build a Model - Using Linear Regression
# fit() returns the fitted estimator, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)
# Predict salaries for the held-out rows
y_pred = model.predict(X_test)
# Step 7: Evaluate the Model - Mean Squared Error (MSE) on the test set
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
# Step 8: Report the results
print("\nModel successfully built and evaluated.")