lab-5-nguyenngocmaithi-20130120
1 DM Lab 5: Regression
3 Regression
• Classification predicts a categorical value
– a finite set of values
• Regression predicts a numerical value
– a possibly infinite set of values
– can be interpolating or extrapolating (a quick illustration follows)
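#Note. A minimal sketch (not from the lab handout) making the interpolation/extrapolation distinction concrete: a line is fit on inputs 1 to 10, then queried inside and outside that range.
import numpy as np
from sklearn.linear_model import LinearRegression

X_demo = np.arange(1, 11).reshape(-1, 1)      # training inputs cover 1..10
y_demo = 2.5 * X_demo.ravel() + 4             # synthetic linear target
line = LinearRegression().fit(X_demo, y_demo)
print(line.predict([[5.5]]))    # interpolation: query inside the training range
print(line.predict([[25.0]]))   # extrapolation: query outside it, far less reliable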
Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/Lab5
#Import libraries
[12]: import pandas as pd
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, SelectFwe
from sklearn.feature_selection import f_regression
# imports added here because later cells use them (RandomForestRegressor, make_scorer, PrettyTable)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from prettytable import PrettyTable
# (the data-loading cell was lost in the export; presumably dataset1 was loaded here, as it is later in Task 1.5)
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset1.csv')
print(data.columns)
#Task 1.3. Apply LinearRegression to the attribute combinations that have a linear relationship.
[33]: def perform_linear_regression(X, y):
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    return model, y_pred  # return both so the caller can unpack them (missing in the export)

# train_set comes from an earlier train/test split cell that did not survive the export
X_train = train_set[['age']]
y_train = train_set['weight']
model_weight, y_pred_weight = perform_linear_regression(X_train, y_train)
plt.figure(figsize=(8, 6))
plt.scatter(train_set['age'], train_set['weight'], color='blue', label='Actual data')
plt.plot(train_set['age'], y_pred_weight, color='red', label='Regression line')
plt.title('Linear Regression: Age vs Weight')
plt.xlabel('Age')
plt.ylabel('Weight')
plt.legend()
plt.show()
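#Note. The original cell stops at the plot; as an added sanity check (using the metrics already imported above), the training error of the age-to-weight fit could be reported like this:
rmse_weight = np.sqrt(mean_squared_error(y_train, y_pred_weight))  # hedged addition, not in the original
r2_weight = r2_score(y_train, y_pred_weight)
print(f"Train RMSE: {rmse_weight:.3f}, Train R^2: {r2_weight:.3f}")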
[ ]: # model, X_test, y_test, and features come from earlier cells lost in the export
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
plt.figure(figsize=(12, 6))
plt.subplot(1, 3, 1)
plt.scatter(y_train, y_train_pred, color='blue', alpha=0.6)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=3)
plt.title('Training Set: Actual vs Predicted')
plt.xlabel('Actual Values (Train)')
plt.ylabel('Predicted Values (Train)')
plt.subplot(1, 3, 2)
plt.scatter(y_test, y_test_pred, color='green', alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=3)
plt.subplot(1, 3, 3)
plt.scatter(X_train[features[0]], y_train, color='blue', label='Train', alpha=0.6)
#Task 1.5. Apply Polynomial regression to the combinations that have a non-linear relationship.
[ ]: dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Lab5/dataset1.csv')
print(dataset.columns)
sns.scatterplot(x='age', y='length', data=dataset)
plt.title('Scatter plot of age vs length')
plt.show()
[ ]: poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)
y_train_pred = model_poly.predict(X_train_poly)
y_test_pred = model_poly.predict(X_test_poly)
print(f"Mean Squared Error (Train): {mean_squared_error(y_train,␣
↪y_train_pred)}")
random_forest_model = RandomForestRegressor()
random_forest_model.fit(X, y)
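#Note. The forest is fit but never evaluated in the surviving text; a sketch of a quick inspection (assuming the same X and y as in this cell):
y_rf_pred = random_forest_model.predict(X)              # in-sample predictions
print(f"Train RMSE: {np.sqrt(mean_squared_error(y, y_rf_pred)):.3f}")
print(random_forest_model.feature_importances_)         # relative importance of each input column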
knn = KNeighborsRegressor()
param_grid = {
'n_neighbors': np.arange(1, 21)
}
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)
grid_search = GridSearchCV(estimator=knn,
param_grid=param_grid,
cv=10,
scoring=rmse_scorer,
n_jobs=-1)
grid_search.fit(X, y)
print(f"Best parameter (k): {grid_search.best_params_['n_neighbors']}")
print(f"Best RMSE score: {-grid_search.best_score_:.4f}")
results = grid_search.cv_results_
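#Note. cv_results_ is collected but no plot survives in the export; the usual k-versus-RMSE curve would look like this (a sketch; the scorer's scores are negated):
ks = results['param_n_neighbors'].data          # candidate k values
mean_rmse = -results['mean_test_score']         # undo the sign flip from greater_is_better=False
plt.plot(ks, mean_rmse, marker='o')
plt.xlabel('n_neighbors (k)')
plt.ylabel('Cross-validated RMSE')
plt.title('KNN: RMSE vs k')
plt.show()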
[55]: train_data = pd.read_csv('dataset2_train.csv', delimiter=';', engine='python')
test_data = pd.read_csv('dataset2_test.csv', delimiter=';', engine='python')
# Drop the 'id' column
train_data = train_data.drop(columns=['id'], errors='ignore')
test_data = test_data.drop(columns=['id'], errors='ignore')
X_test = test_data.drop(columns=['Birthweight'])
y_test = test_data['Birthweight']
if 'Birthweight' in train_data.columns:   # 'data' in the original presumably refers to train_data
    X_train = train_data.drop(columns=['Birthweight'])
    y_train = train_data['Birthweight']
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # compute the metrics before printing them (this step was missing in the exported cell)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE: {rmse}")
    print(f"R^2: {r2}")
else:
    print("'Birthweight' column not found in the DataFrame. Please verify the CSV file.")
if 'Birthweight' in train_data.columns:
    X_train = train_data.drop(columns=['Birthweight'])
    y_train = train_data['Birthweight']
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # use F-regression to score each feature
    f_values, p_values = f_regression(X_train_scaled, y_train)
    for name, p in zip(X_train.columns, p_values):
        print(f"Feature: {name}, p-value: {p}")
    # keep the features significant at the 5% level (the selection step was lost in the export)
    significant = p_values < 0.05
    print(f"Significant features: {list(X_train.columns[significant])}")
    X_train_significant = X_train_scaled[:, significant]
    X_test_significant = X_test_scaled[:, significant]
    model_significant = LinearRegression()
    model_significant.fit(X_train_significant, y_train)
    y_pred_significant = model_significant.predict(X_test_significant)
    rmse_significant = np.sqrt(mean_squared_error(y_test, y_pred_significant))
    r2_significant = r2_score(y_test, y_pred_significant)
    print(f"RMSE with significant features: {rmse_significant}")
    print(f"R² with significant features: {r2_significant}")
else:
    print("'Birthweight' column not found in the DataFrame. Please verify the CSV file.")
Feature: mheight, p-value: 0.1491617260027474
Feature: mppwt, p-value: 0.27121046212680633
Feature: fage, p-value: 0.5585513524547902
Feature: fedyrs, p-value: 0.8139728510535685
Feature: fnocig, p-value: 0.49878233204021105
Feature: fheight, p-value: 0.3776218690169165
Feature: lowbwt, p-value: 0.1050528581711928
Feature: mage35, p-value: 0.8872871284644785
Significant features: ['headcirumference', 'length', 'Gestation']
RMSE with significant features: 1.1622289098666954
R² with significant features: 0.20019839800201666
#Task 3. Dataset3 includes 12 continuous attributes and 1 binary attribute concerning the price of houses.
#Task 3.1. Load dataset3, apply StandardScaler() to standardize the dataset, and prepare the training set and test set.
[16]: train_data = pd.read_csv('dataset3_train.csv', sep=';')
test_data = pd.read_csv('dataset3_test.csv', sep=';')
# combine the two sets before splitting off features, so combined_data exists before it is used
combined_data = pd.concat([train_data, test_data], ignore_index=True)
X = combined_data.drop(columns=['MEDV'])
y = combined_data['MEDV']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
models = {
"Linear Regression": LinearRegression(),
"Ridge": Ridge(),
"KNeighbors Regressor": KNeighborsRegressor(),
"Decision Tree Regressor": DecisionTreeRegressor(),
"MLP Regressor": MLPRegressor(max_iter=1000)
}
table = PrettyTable()
table.field_names = ["Model", "RMSE", "R^2"]
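#Note. The loop that filled the table was lost in the export; a reconstruction consistent with the imports above (cross_val_predict) and the printed results:
for name, model in models.items():
    y_pred = cross_val_predict(model, X_scaled, y, cv=10)   # 10-fold CV predictions
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    table.add_row([name, round(rmse, 2), round(r2, 2)])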
print(table)
+-------------------------+------+------+
| Model | RMSE | R^2 |
+-------------------------+------+------+
| Linear Regression | 4.03 | 0.81 |
| Ridge | 4.03 | 0.81 |
| KNeighbors Regressor | 4.46 | 0.77 |
| Decision Tree Regressor | 4.01 | 0.81 |
| MLP Regressor | 3.05 | 0.89 |
+-------------------------+------+------+
/usr/local/lib/python3.10/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:690:
ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.
  warnings.warn(
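#Note. The warning means the MLP's optimizer stopped before its loss stabilized; the usual remedy is more iterations or early stopping, e.g. (parameter values are illustrative):
mlp = MLPRegressor(max_iter=5000, early_stopping=True, random_state=0)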