
lab-5-nguyenngocmaithi-20130120

October 29, 2024

1 DM Lab 5: Regression

2 20130120 - Nguyễn Ngọc Mai Thi

3 Regression
• Classification predicts a categorical value
– a finite set of values
• Regression predicts a numerical value
– a possibly infinite set of values
– can be interpolating (predicting inside the range of the training data) or extrapolating (predicting outside it); see the sketch below
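
A minimal sketch of the distinction, using made-up data rather than one of the lab's datasets:

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.arange(0, 10).reshape(-1, 1)   # training inputs cover the range 0..9
y = 2.0 * X.ravel() + 1.0             # a perfectly linear target

model = LinearRegression().fit(X, y)

print(model.predict([[5.5]]))   # interpolation: 5.5 lies inside [0, 9]
print(model.predict([[25.0]]))  # extrapolation: 25 lies far outside the training range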

3.1 Regression Estimators


Regression estimators work in the same way as classification estimators in scikit-learn; a short sketch of the shared fit/predict interface follows the list:
Linear Regression Models: LinearRegression, Ridge
K-Nearest Neighbor Regression: KNeighborsRegressor
Decision Tree Regression: DecisionTreeRegressor
Neural Network Regression: MLPRegressor
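
A minimal sketch (synthetic data) of that shared interface; every estimator is trained and queried with the same two calls:

import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(50, 1))
y = 3.0 * X.ravel() + rng.normal(scale=0.5, size=50)

for est in [LinearRegression(), Ridge(), KNeighborsRegressor(), DecisionTreeRegressor()]:
    est.fit(X, y)                                    # identical training call
    print(type(est).__name__, est.predict([[4.2]]))  # identical prediction call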
[1]: from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/MyDrive/Colab Notebooks/Lab5'

Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/Lab5
#Import libraries
[12]: import pandas as pd
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, SelectFwe
from sklearn.feature_selection import f_regression

#Task 1. With dataset1


#Task 1.1. Load dataset1 using Pandas and split it into 70% for the training set and 30% for the test set
[3]: import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset1.csv')

train_set, test_set = train_test_split(dataset, test_size=0.3, random_state=42)


print(f"Training set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

Training set size: 30
Test set size: 14
#Task 1.2. Generate plots to determine the relationship between each pair of attributes (i.e., between age and temp, between age and weight, …)

[39]: data = pd.read_csv('dataset1.csv')

print(data.columns)

columns = ['age', 'temp', 'weight', 'length']

# Create a scatter plot for every pair of columns
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        plt.figure(figsize=(8, 6))
        sns.scatterplot(data=data, x=columns[i], y=columns[j], hue='temp', palette='coolwarm')
        plt.title(f'Relationship between {columns[i]} and {columns[j]}')
        plt.xlabel(columns[i])
        plt.ylabel(columns[j])
        plt.show()

Index(['age', 'temp', 'weight', 'length'], dtype='object')

[Six scatter plots, one per attribute pair: age vs temp, age vs weight, age vs length, temp vs weight, temp vs length, weight vs length]
#Task 1.3. Apply LinearRegression to the combinations that have a linear relationship.
[33]: def perform_linear_regression(X, y):
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    return model, y_pred

X_train = train_set[['age']]
y_train = train_set['weight']
model_weight, y_pred_weight = perform_linear_regression(X_train, y_train)

print(f'Linear Regression Coefficients (Weight): {model_weight.coef_}, Intercept: {model_weight.intercept_}')

plt.figure(figsize=(8, 6))
plt.scatter(train_set['age'], train_set['weight'], color='blue', label='Actual data')

plt.plot(train_set['age'], y_pred_weight, color='red', label='Regression line')
plt.title('Linear Regression: Age vs Weight')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()

Linear Regression Coefficients (Weight): [1.90225857], Intercept: 19.10576323987536

#Task 1.4. Plot train/test/prediction to visualize the relationship between them

[ ]: # Reuse model_weight from Task 1.3; the test split and feature list were not defined yet
features = ['age']
X_test = test_set[features]
y_test = test_set['weight']

y_train_pred = model_weight.predict(X_train)
y_test_pred = model_weight.predict(X_test)
plt.figure(figsize=(12, 6))

plt.subplot(1, 3, 1)
plt.scatter(y_train, y_train_pred, color='blue', alpha=0.6)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=3)

plt.title('Training Set: Actual vs Predicted')
plt.xlabel('Actual Values (Train)')
plt.ylabel('Predicted Values (Train)')

plt.subplot(1, 3, 2)
plt.scatter(y_test, y_test_pred, color='green', alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=3)
plt.title('Test Set: Actual vs Predicted')
plt.xlabel('Actual Values (Test)')
plt.ylabel('Predicted Values (Test)')

plt.subplot(1, 3, 3)
plt.scatter(X_train[features[0]], y_train, color='blue', label='Train', alpha=0.6)
plt.scatter(X_test[features[0]], y_test, color='green', label='Test', alpha=0.6)
plt.scatter(X_test[features[0]], y_test_pred, color='red', label='Prediction', alpha=0.6)
plt.title('Train vs Test vs Prediction')
plt.xlabel(features[0])
plt.ylabel('Target Value')
plt.legend()
plt.tight_layout()
plt.show()

#Task 1.5. Apply polynomial regression to the combinations that have a non-linear relationship

[ ]: dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset1.csv')

print(dataset.columns)
sns.scatterplot(x='age', y='length', data=dataset)
plt.title('Scatter plot of age vs length')
plt.show()

Index(['age', 'temp', 'weight', 'length'], dtype='object')

#Task 1.6. Plot train/test/prediction to visualize the relationship between them

[ ]: poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
y_train_pred = model_poly.predict(X_train_poly)
y_test_pred = model_poly.predict(X_test_poly)

print(f"Mean Squared Error (Train): {mean_squared_error(y_train, y_train_pred)}")
print(f"Mean Squared Error (Test): {mean_squared_error(y_test, y_test_pred)}")
print(f"R-squared (Train): {r2_score(y_train, y_train_pred)}")
print(f"R-squared (Test): {r2_score(y_test, y_test_pred)}")

Mean Squared Error (Train): 212086.52815239134
Mean Squared Error (Test): 176684.3912856702
R-squared (Train): 0.8918695733536053
R-squared (Test): 0.8545284119833094
#Task 1.7. Measure the performance of the different regression models trained before. Use 10-fold
cross validation and RMSE as well as R² for evaluation.
[ ]: from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

# X and y are assumed to be defined earlier in the session; cross_val_predict
# fits fresh clones of each model internally, so pre-fitting is unnecessary
kf = KFold(n_splits=10, shuffle=True, random_state=42)

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor()
}

for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=kf)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    print(f"{name}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

Linear Regression: RMSE = 674.0597, R² = 0.7368
Decision Tree: RMSE = 732.3463, R² = 0.6893
Random Forest: RMSE = 627.8487, R² = 0.7716
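
Note that cross_val_predict pools all out-of-fold predictions and computes a single global RMSE and R²; averaging per-fold scores can give slightly different numbers. A minimal sketch of the per-fold variant, assuming X, y, and kf from the cell above:

from sklearn.model_selection import cross_val_score

# Per-fold RMSE with cross_val_score; scores are negated because scikit-learn maximizes scorers
scores = cross_val_score(LinearRegression(), X, y, cv=kf,
                         scoring='neg_root_mean_squared_error')
print("per-fold RMSE:", -scores)
print("mean RMSE:", -scores.mean())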
#Task 1.8. Apply GridSearchCV to find the influence of k on the performance of KNeighborsRegressor
[ ]: from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

knn = KNeighborsRegressor()
param_grid = {
    'n_neighbors': np.arange(1, 21)
}
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

grid_search = GridSearchCV(estimator=knn,
                           param_grid=param_grid,
                           cv=10,
                           scoring=rmse_scorer,
                           n_jobs=-1)
grid_search.fit(X, y)
print(f"Best parameter (k): {grid_search.best_params_['n_neighbors']}")
print(f"Best RMSE score: {-grid_search.best_score_:.4f}")
results = grid_search.cv_results_

for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"k = {params['n_neighbors']}: RMSE = {-mean_score:.4f}")

Best parameter (k): 2
Best RMSE score: 633.5679
k = 1: RMSE = 782.6614
k = 2: RMSE = 633.5679
k = 3: RMSE = 663.7089
k = 4: RMSE = 654.0621
k = 5: RMSE = 666.7681
k = 6: RMSE = 655.1836
k = 7: RMSE = 653.9601
k = 8: RMSE = 673.2680
k = 9: RMSE = 707.4237
k = 10: RMSE = 720.4250
k = 11: RMSE = 723.3603
k = 12: RMSE = 755.7415
k = 13: RMSE = 776.4079
k = 14: RMSE = 792.3212
k = 15: RMSE = 785.2702
k = 16: RMSE = 801.2186
k = 17: RMSE = 791.0599
k = 18: RMSE = 809.8162
k = 19: RMSE = 824.2354
k = 20: RMSE = 845.0294
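
A quick sketch (reusing the fitted grid_search from the cell above) to visualize the influence of k on the cross-validated RMSE:

# Plot mean 10-fold CV RMSE against k; assumes grid_search was fitted above
ks = [p['n_neighbors'] for p in grid_search.cv_results_['params']]
rmses = -grid_search.cv_results_['mean_test_score']  # negate: the scorer returns negated RMSE

plt.figure(figsize=(8, 4))
plt.plot(ks, rmses, marker='o')
plt.xlabel('k (n_neighbors)')
plt.ylabel('10-fold CV RMSE')
plt.title('Influence of k on KNeighborsRegressor performance')
plt.xticks(ks)
plt.show()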
#Task 2. With dataset2
#Task 2.1. Load dataset2, apply StandardScaler() to standardize the dataset, and prepare the training set and test set. Note that y is the Birthweight column; the id column needs to be dropped.

[55]: train_data = pd.read_csv('dataset2_train.csv', delimiter=';', engine='python')
test_data = pd.read_csv('dataset2_test.csv', delimiter=';', engine='python')

# Drop the 'id' column
train_data = train_data.drop(columns=['id'], errors='ignore')
test_data = test_data.drop(columns=['id'], errors='ignore')

# Split into features (X) and target (y)
X_train = train_data.drop(columns=['Birthweight'])
y_train = train_data['Birthweight']
X_test = test_data.drop(columns=['Birthweight'])
y_test = test_data['Birthweight']

# Standardize the data with StandardScaler (fit on train only, then transform test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm the data shapes after scaling
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train_scaled shape: (29, 16)
X_test_scaled shape: (13, 16)
y_train shape: (29,)
y_test shape: (13,)
#Task 2.2. Apply Linear Regression to the training set/test set obtained in Task 2.1. Report the performance of the model on the test set in terms of RMSE and R^2.
[57]: # Note: this task re-splits dataset2_train.csv instead of reusing the split from Task 2.1
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset2_train.csv', sep=';')

if 'Birthweight' in data.columns:
    X = data.drop(columns=['Birthweight', 'id'])
    y = data['Birthweight']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"training set: {X_train_scaled.shape}, test set: {X_test_scaled.shape}")

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")
else:
    print("'Birthweight' column not found in the DataFrame. Please verify the CSV file.")
    print(f"Available columns: {data.columns.tolist()}")

training set: (23, 16), test set: (6, 16)
RMSE: 1.3006388228482793
R^2: -0.0016415873356199207
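An R² below zero means the model performs worse than simply predicting the mean of y_test, so the full 16-feature linear model does not generalize on this small split.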
#Task 2.3. Apply feature selection to find the features most correlated with the target (y) using F-regression and inspect the p-value for each feature. F-regression runs a univariate linear F-test per feature, so a small p-value indicates a significant linear relationship with the target.
• Then, fit a second regression model using only the significant features (p <= 0.05). How does the performance (i.e., RMSE, R^2) of the model change?

[58]: data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset2_train.csv', sep=';')

if 'Birthweight' in data.columns:
    X = data.drop(columns=['Birthweight', 'id'])
    y = data['Birthweight']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Score each feature against the target with F-regression
    f_values, p_values = f_regression(X_train_scaled, y_train)

    # Print the p-value for each feature
    feature_names = X.columns
    for feature, p_value in zip(feature_names, p_values):
        print(f"Feature: {feature}, p-value: {p_value}")

    # Keep only the features with p-value <= 0.05
    significant_features = [feature for feature, p_value in zip(feature_names, p_values) if p_value <= 0.05]
    print("Significant features:", significant_features)

    significant_idx = [i for i, p in enumerate(p_values) if p <= 0.05]
    X_train_significant = X_train_scaled[:, significant_idx]
    X_test_significant = X_test_scaled[:, significant_idx]

    # Fit a second model on the significant features only
    model_significant = LinearRegression()
    model_significant.fit(X_train_significant, y_train)

    y_pred_significant = model_significant.predict(X_test_significant)

    # RMSE and R² for the second model
    rmse_significant = np.sqrt(mean_squared_error(y_test, y_pred_significant))
    r2_significant = r2_score(y_test, y_pred_significant)

    print(f"RMSE with significant features: {rmse_significant}")
    print(f"R² with significant features: {r2_significant}")
else:
    print("'Birthweight' column not found in the DataFrame. Please verify the CSV file.")
    print(f"Available columns: {data.columns.tolist()}")

Feature: LowBirthWeight = Low, p-value: 0.1050528581711928
Feature: LowBirthWeight = Normal, p-value: 0.1050528581711928
Feature: headcirumference, p-value: 5.008320281668956e-06
Feature: length, p-value: 0.00046266616203449685
Feature: Gestation, p-value: 0.00877344993996407
Feature: smoker, p-value: 0.3240497275363667
Feature: motherage, p-value: 0.7604666499658355
Feature: mnocig, p-value: 0.13516981879131684
Feature: mheight, p-value: 0.1491617260027474
Feature: mppwt, p-value: 0.27121046212680633
Feature: fage, p-value: 0.5585513524547902
Feature: fedyrs, p-value: 0.8139728510535685
Feature: fnocig, p-value: 0.49878233204021105
Feature: fheight, p-value: 0.3776218690169165
Feature: lowbwt, p-value: 0.1050528581711928
Feature: mage35, p-value: 0.8872871284644785
Significant features: ['headcirumference', 'length', 'Gestation']
RMSE with significant features: 1.1622289098666954
R² with significant features: 0.20019839800201666
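Compared with the full 16-feature model from Task 2.2, restricting the regression to the three significant features improves the test-set performance noticeably: RMSE drops from about 1.30 to 1.16 and R² rises from roughly 0.00 to 0.20.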
#Task 3. With dataset3. Dataset3 includes 12 continuous attributes and 1 binary attribute concerning the price of houses.
#Task 3.1. Load dataset3, apply StandardScaler() to standardize the dataset, and prepare the training set and test set.
[16]: train_data = pd.read_csv('dataset3_train.csv', sep=';')
test_data = pd.read_csv('dataset3_test.csv', sep=';')

combined_data = pd.concat([train_data, test_data], ignore_index=True)

X = combined_data.drop(columns=['MEDV'])
y = combined_data['MEDV']

# Note: the scaler is fitted on the combined data here; fitting on the
# training split only (as in Task 2.1) avoids leaking test statistics
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=len(test_data), random_state=42
)

print("Training set size:", X_train_scaled.shape)
print("Test set size:", X_test_scaled.shape)

Training set size: (354, 12)
Test set size: (152, 12)
#Task 3.2. Apply LinearRegression, Ridge, KNeighborsRegressor, DecisionTreeRegressor, MLPRegressor to train models using the above training set
[28]: train_data = pd.read_csv('dataset3_train.csv', sep=';')
test_data = pd.read_csv('dataset3_test.csv', sep=';')

combined_data = pd.concat([train_data, test_data], ignore_index=True)

X = combined_data.drop(columns=['MEDV'])
y = combined_data['MEDV']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=len(test_data), random_state=42
)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "MLP Regressor": MLPRegressor(max_iter=1000)
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predictions)
    print(f"{name} - Mean Squared Error: {mse:.2f}")

Linear Regression - Mean Squared Error: 16.25
Ridge - Mean Squared Error: 16.27
KNeighbors Regressor - Mean Squared Error: 19.89
Decision Tree Regressor - Mean Squared Error: 17.07
MLP Regressor - Mean Squared Error: 9.21

/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:690: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  warnings.warn(
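
The ConvergenceWarning above means the MLP's stochastic optimizer used up its iteration budget before converging. A hedged sketch of two common remedies, with illustrative (untuned) settings, reusing X_train_scaled/y_train from the cell above:

# Two illustrative ways to address the ConvergenceWarning (settings not tuned)
from sklearn.neural_network import MLPRegressor

# Option 1: simply allow more iterations
mlp_long = MLPRegressor(max_iter=5000, random_state=42)

# Option 2: stop once the validation score stops improving
mlp_early = MLPRegressor(max_iter=1000, early_stopping=True,
                         n_iter_no_change=20, random_state=42)

mlp_early.fit(X_train_scaled, y_train)
print("test MSE:", mean_squared_error(y_test, mlp_early.predict(X_test_scaled)))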
#Task 3.3. Report the performance of the models using the metrics RMSE and R^2 (as a PrettyTable)

[31]: from prettytable import PrettyTable

train_data = pd.read_csv('dataset3_train.csv', sep=';')
test_data = pd.read_csv('dataset3_test.csv', sep=';')

combined_data = pd.concat([train_data, test_data], ignore_index=True)

X = combined_data.drop(columns=['MEDV'])
y = combined_data['MEDV']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=len(test_data), random_state=42
)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "MLP Regressor": MLPRegressor(max_iter=1000)
}

table = PrettyTable()
table.field_names = ["Model", "RMSE", "R^2"]

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    table.add_row([name, round(rmse, 2), round(r2, 2)])

print(table)

+-------------------------+------+------+
| Model | RMSE | R^2 |
+-------------------------+------+------+
| Linear Regression | 4.03 | 0.81 |
| Ridge | 4.03 | 0.81 |
| KNeighbors Regressor | 4.46 | 0.77 |
| Decision Tree Regressor | 4.01 | 0.81 |
| MLP Regressor | 3.05 | 0.89 |
+-------------------------+------+------+
/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:690: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  warnings.warn(
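
Despite the convergence warning, the MLP Regressor achieves the best test-set performance of the five models (RMSE 3.05, R² 0.89), ahead of the tree-based and linear models.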

