21brs1474 ML Lab 2
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
ad_data = pd.read_csv('/content/Regression3.csv')
ad_data.info()
ad_data.describe()
p = sns.pairplot(ad_data)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   X1      209 non-null    int64
 1   X2      209 non-null    int64
 2   X3      209 non-null    int64
 3   X4      209 non-null    int64
 4   X4.1    209 non-null    int64
 5   X6      209 non-null    int64
 6   X7      209 non-null    int64
 7   Y       209 non-null    int64
dtypes: int64(8)
memory usage: 13.2 KB
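All eight columns are complete (209 non-null) int64 columns, so the dataset needs no imputation or type conversion before modelling.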
[8]: p = sns.pairplot(ad_data, x_vars=['X1','X2','X3','X4','X4.1','X6','X7'],
         y_vars='Y', height=7, aspect=0.7)
[12]: from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
x = ad_data.drop(["Y"], axis=1)
y = ad_data.Y
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)  # split parameters assumed
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_train)
print("R squared: {}".format(r2_score(y_true=y_train, y_pred=y_pred)))
R squared: 0.9657892242175452
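The diagnostic cells below reference a residuals variable that is not defined in the surviving cells; a minimal definition, assuming residuals are taken on the training set:

residuals = y_train - y_pred  # assumed definition: observed minus fitted training values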
[27]: import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
print(lzip(name, test))
BartlettResult(statistic=4779.093717613901, pvalue=0.0)
[31]: p = sns.histplot(residuals, kde=True)  # distplot is deprecated in recent seaborn
p = plt.title('Normality of error terms/residuals')
[38]: plt.figure(figsize=(10,5))
# Pass x and y as keyword arguments, and rename p to avoid overwriting the plot function
p = sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')  # plot call assumed; mirrors the later autocorrelation cell
plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()
[42]: from statsmodels.stats import diagnostic as diag
result = diag.acorr_ljungbox(residuals, lags=40)
if len(result.columns) > 1:  # Check if DataFrame has multiple columns
    min_p_value = result['lb_pvalue'].min()  # Access 'lb_pvalue' column
    print(min_p_value)
else:
    print("The result of acorr_ljungbox does not have a second element.")
0.0634059439491035
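With the smallest Ljung-Box p-value across 40 lags at 0.063 (> 0.05), the test fails to reject the null hypothesis of no autocorrelation in the residuals (the full table of lag-wise statistics appears further below).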
[45]: import statsmodels.api as sm
sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()
[46]: plt.figure(figsize=(20,20))  # set the figure size to 20 by 20
[47]: from sklearn.tree import DecisionTreeRegressor
dec_tree = DecisionTreeRegressor(random_state=0)
dec_tree.fit(X_train,y_train)
dec_tree_y_pred = dec_tree.predict(X_train)
print("Accuracy: {}".format(dec_tree.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=dec_tree_y_pred)))
Accuracy: 1.0
R squared: 1.0
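For regressors, .score() returns R squared, which is why the two printed values always match. A perfect 1.0 on the training set means the unpruned tree has memorised the training data; the held-out metrics further below are the meaningful comparison.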
[48]: from sklearn.ensemble import RandomForestRegressor
rf_tree = RandomForestRegressor(random_state=0)
rf_tree.fit(X_train,y_train)
rf_tree_y_pred = rf_tree.predict(X_train)
print("Accuracy: {}".format(rf_tree.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=rf_tree_y_pred)))
Accuracy: 0.9879262687432366
R squared: 0.9879262687432366
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train,y_train)
svr_y_pred = svr.predict(X_train)
print("Accuracy: {}".format(svr.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=svr_y_pred)))
Accuracy: -0.01607994872969165
R squared: -0.01607994872969165
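The negative train score means the default RBF-kernel SVR does worse than predicting the mean, which is typical on unscaled features. A sketch of the usual remedy, not part of the original lab, standardising the inputs in a pipeline:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Hypothetical variant: standardise features before the RBF kernel is applied
scaled_svr = make_pipeline(StandardScaler(), SVR())
scaled_svr.fit(X_train, y_train)
print("Scaled SVR train R squared: {}".format(scaled_svr.score(X_train, y_train)))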
print("R-squared (R^2):", r2_score(y_test, rf_tree_y_pred_test))
print("\n")
Linear Regression:
Mean Absolute Error (MAE): 22.818438000992135
Mean Squared Error (MSE): 1090.7880443734675
R-squared (R^2): 0.9014501379286975
Decision Tree:
Mean Absolute Error (MAE): 16.18867924528302
Mean Squared Error (MSE): 1239.132075471698
R-squared (R^2): 0.8880476406431422
Random Forest:
Mean Absolute Error (MAE): 13.44433962264151
Mean Squared Error (MSE): 1073.5168018867928
R-squared (R^2): 0.9030105497553962
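Only the tail of the evaluation cell survived extraction above; a minimal reconstruction consistent with the printed test-set metrics, assuming X_test and y_test come from the earlier train/test split:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hypothetical reconstruction of the truncated evaluation cell
for model_name, model in [('Linear Regression', regr),
                          ('Decision Tree', dec_tree),
                          ('Random Forest', rf_tree)]:
    y_pred_test = model.predict(X_test)
    print(model_name + ':')
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_test))
    print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_test))
    print("R-squared (R^2):", r2_score(y_test, y_pred_test))
    print("\n")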
# c. Homoscedasticity of residuals (leading lines of this cell assumed)
plt.figure(figsize=(10,5))
p = sns.scatterplot(x=y_pred, y=residuals, color='blue')  # plot call reconstructed
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-50, 50)
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='blue')
plt.title('Residuals vs fitted values plot for homoscedasticity check')
plt.show()
# d. No autocorrelation of residuals
plt.figure(figsize=(10,5))
sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.xlim(0, 26)
sns.lineplot(x=[0, 26], y=[0, 0], color='red')  # zero reference line
plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()
# e. No perfect multicollinearity
plt.figure(figsize=(20,20))
p = sns.heatmap(ad_data.corr(), annot=True, cmap='RdYlGn', square=True)
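Predictors that each correlate strongly with Y (e.g. X7 and X3) are likely correlated with each other as well, so a variance inflation factor check is a standard complement to the heatmap. A sketch, assuming the predictor frame x from the earlier cell:

from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# VIF per predictor; values far above 10 are the usual multicollinearity flag
vif = pd.DataFrame({
    'feature': x.columns,
    'VIF': [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
})
print(vif)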
lb_stat lb_pvalue
1 3.445976 0.063406
2 3.841507 0.146497
3 3.862897 0.276654
4 4.055939 0.398488
5 4.056045 0.541375
6 6.149900 0.406610
7 6.623357 0.469120
8 8.695414 0.368638
9 9.784872 0.368180
10 12.432321 0.257164
11 12.625875 0.318479
12 15.258457 0.227606
13 17.323003 0.184956
14 17.351213 0.237944
15 17.351245 0.298304
16 17.476452 0.355432
17 19.121962 0.321577
18 20.900772 0.284449
19 22.071802 0.280697
20 22.094391 0.335411
21 22.317215 0.381432
22 23.712936 0.362455
23 24.365714 0.383795
24 24.822061 0.415481
25 25.794042 0.418654
26 25.903830 0.468401
27 26.541119 0.488750
28 26.970155 0.519873
29 27.253867 0.558010
30 27.574405 0.592979
31 27.927626 0.624899
32 28.985610 0.619894
33 30.859367 0.574087
34 33.422901 0.495736
35 33.475086 0.541772
36 33.483268 0.588883
37 33.485034 0.634610
38 33.513072 0.676927
39 33.911212 0.700877
40 42.353219 0.369803
[52]: import matplotlib.pyplot as plt
# Calculate correlations with the target variable
correlations = ad_data.corr()['Y'].sort_values(ascending=False)
print(correlations)
# Plot correlations
plt.figure(figsize=(10, 6))
correlations.drop('Y').plot(kind='bar')
plt.title('Correlation with Target Variable (Y)')
plt.xlabel('Independent Variables')
plt.ylabel('Correlation Coefficient')
plt.show()
Y 1.000000
X7 0.966472
X3 0.901202
X2 0.819292
X4 0.648620
X4.1 0.610580
X6 0.592156
X1 -0.288396
Name: Y, dtype: float64
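X7 is by far the strongest single predictor of Y (r ≈ 0.97), followed by X3 and X2, while X1 is the only feature with a (weak) negative correlation.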
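The cell producing the cross-validated scores below did not survive extraction; a minimal sketch of the usual pattern, assuming 5-fold cross-validation (the model evaluated is also an assumption):

from sklearn.model_selection import cross_val_score
import numpy as np

# Hypothetical reconstruction: five folds of negative MSE, converted to RMSE
scores = cross_val_score(regr, x, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print("Cross-validated RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())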
Cross-validated RMSE scores: [70.69045924 19.30174199 35.34990181 33.80403957
45.35382374]
Average RMSE: 40.89999326980918