21brs1474 ML Lab 2


July 26, 2024

[6]: import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set(context="notebook", palette="Spectral", style='darkgrid', font_scale=1.5, color_codes=True)

import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt

ad_data = pd.read_csv('/content/Regression3.csv')

ad_data.info()
ad_data.describe()

p = sns.pairplot(ad_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 X1 209 non-null int64
1 X2 209 non-null int64
2 X3 209 non-null int64
3 X4 209 non-null int64
4 X4.1 209 non-null int64
5 X6 209 non-null int64
6 X7 209 non-null int64
7 Y 209 non-null int64
dtypes: int64(8)
memory usage: 13.2 KB

[8]: p = sns.pairplot(ad_data, x_vars=['X1','X2','X3','X4','X4.1','X6','X7'], y_vars='Y', height=7, aspect=0.7)  # 'size' was renamed 'height' in seaborn

[12]: x = ad_data.drop(["Y"],axis=1)
y = ad_data.Y

[13]: from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(x)  # note: fitted on the full dataset; see the leakage note below

[14]: from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
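Note that the scaler in cell [13] was fit on the full feature matrix before splitting, which leaks test-set statistics into the preprocessing. A minimal leakage-free sketch (the _alt names are illustrative, not part of the original notebook):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then fit the scaler on the training rows only
x_train_alt, x_test_alt, y_train_alt, y_test_alt = train_test_split(x, y, random_state=0, test_size=0.25)
sc_alt = StandardScaler()
X_train_alt = sc_alt.fit_transform(x_train_alt)  # statistics estimated from training rows only
X_test_alt = sc_alt.transform(x_test_alt)        # test rows transformed with training statistics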

[15]: from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_train)  # predictions on the training set

[16]: print("R squared: {}".format(r2_score(y_true=y_train,y_pred=y_pred)))

R squared: 0.9657892242175452

[17]: residuals = y_train.values - y_pred
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

Mean of Residuals -1.166018848426831e-14

[26]: df = pd.DataFrame({'y_pred': y_pred, 'residuals': residuals})  # df must be built before plotting
p = sns.scatterplot(x='y_pred', y='residuals', data=df)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-50, 50)
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='blue')  # zero-residual reference line
plt.title('Residuals vs fitted values plot for homoscedasticity check')
plt.show()

[27]: import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

[27]: [('F statistic', 0.8775955315176062), ('p-value', 0.7081442994879754)]
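The Goldfeld-Quandt p-value (~0.71) gives no evidence against homoscedasticity. As a cross-check, the Breusch-Pagan test could be applied to the same residuals; a minimal sketch, assuming residuals and X_train as defined above:

import statsmodels.stats.api as sms
import statsmodels.api as sm

# Breusch-Pagan regresses squared residuals on the regressors;
# a small p-value would indicate heteroscedasticity.
bp_names = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
bp_test = sms.het_breuschpagan(residuals, sm.add_constant(X_train))
print(dict(zip(bp_names, bp_test)))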

[30]: from scipy.stats import bartlett

# Flatten X_train to a 1-D array (no need for .values); note that Bartlett's
# test compares group variances, so this is only a coarse check here.
test = bartlett(X_train.ravel(), residuals)
print(test)

BartlettResult(statistic=4779.093717613901, pvalue=0.0)
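Bartlett's test assumes normal data, and comparing flattened features against residuals is a coarse use of it; Levene's test is a more robust variant of the same comparison. A minimal sketch, reusing the arrays above:

from scipy.stats import levene

# Levene's test uses absolute deviations from the median by default,
# making it less sensitive to non-normality than Bartlett's test.
lev = levene(X_train.ravel(), residuals)
print(lev)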

[31]: p = sns.histplot(residuals, kde=True)  # distplot is deprecated in recent seaborn
p = plt.title('Normality of error terms/residuals')

[38]: plt.figure(figsize=(10, 5))

# Pass x and y as keyword arguments; the variable is named line_plot so it
# does not shadow the plot function.
line_plot = sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')

plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')

# Set the x-axis limits before drawing the horizontal reference line
plt.xlim(0, 26)

# Draw the zero-residual reference line with plt.plot (a dict passed to
# sns.lineplot without x/y would be treated as wide-form data and misplot)
plt.plot([0, 26], [0, 0], color='red')

plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()

[42]: from statsmodels.stats import diagnostic as diag

result = diag.acorr_ljungbox(residuals, lags=40)
if len(result.columns) > 1:  # acorr_ljungbox returns a DataFrame in recent statsmodels
    min_p_value = result['lb_pvalue'].min()  # smallest p-value across all lags
    print(min_p_value)
else:
    print("The result of acorr_ljungbox does not have a second element.")

0.0634059439491035
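The smallest Ljung-Box p-value across 40 lags is about 0.063, so there is no strong evidence of autocorrelation at the 5% level. The Durbin-Watson statistic offers a quick complementary first-order check; a minimal sketch, assuming residuals from above:

from statsmodels.stats.stattools import durbin_watson

# Values near 2 indicate little first-order autocorrelation;
# values toward 0 or 4 indicate positive or negative autocorrelation.
dw = durbin_watson(residuals)
print("Durbin-Watson statistic:", dw)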

[43]: import statsmodels.api as sm

[44]: sm.graphics.tsa.plot_acf(residuals, lags=40)
plt.show()

[45]: sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()

[46]: plt.figure(figsize=(20, 20))  # set the figure size to 20 by 20
p = sns.heatmap(ad_data.corr(), annot=True, cmap='RdYlGn', square=True)  # seaborn has a very simple solution for heatmaps
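The heatmap shows pairwise correlations only; variance inflation factors quantify multicollinearity against all other regressors at once. A minimal sketch, assuming the unscaled feature frame x from cell [12]:

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# VIF > 10 is a common rule of thumb for problematic multicollinearity.
X_vif = sm.add_constant(x)
for i, col in enumerate(X_vif.columns):
    if col != 'const':
        print(col, variance_inflation_factor(X_vif.values, i))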

[47]: from sklearn.tree import DecisionTreeRegressor

dec_tree = DecisionTreeRegressor(random_state=0)
dec_tree.fit(X_train, y_train)
dec_tree_y_pred = dec_tree.predict(X_train)
# Note: for regressors, .score() returns R-squared, not classification accuracy
print("Accuracy: {}".format(dec_tree.score(X_train, y_train)))
print("R squared: {}".format(r2_score(y_true=y_train, y_pred=dec_tree_y_pred)))

Accuracy: 1.0
R squared: 1.0
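A training R-squared of exactly 1.0 means the unpruned tree memorized the training set; the held-out metrics in cell [50] below are the meaningful ones. One possible way to rein in the overfitting is depth limiting; a minimal sketch (max_depth=5 is an illustrative, untuned choice):

from sklearn.tree import DecisionTreeRegressor

# A depth-limited tree trades training fit for better generalization
shallow_tree = DecisionTreeRegressor(max_depth=5, random_state=0)
shallow_tree.fit(X_train, y_train)
print("Train R^2:", shallow_tree.score(X_train, y_train))
print("Test R^2:", shallow_tree.score(X_test, y_test))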

[48]: from sklearn.ensemble import RandomForestRegressor

rf_tree = RandomForestRegressor(random_state=0)
rf_tree.fit(X_train,y_train)
rf_tree_y_pred = rf_tree.predict(X_train)
print("Accuracy: {}".format(rf_tree.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=rf_tree_y_pred)))

Accuracy: 0.9879262687432366
R squared: 0.9879262687432366

[49]: from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train,y_train)
svr_y_pred = svr.predict(X_train)
print("Accuracy: {}".format(svr.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=svr_y_pred)))

Accuracy: -0.01607994872969165
R squared: -0.01607994872969165
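A negative training R-squared suggests the default SVR settings fit worse than predicting the mean, likely because the target is left unscaled while the features are standardized. A minimal sketch of one remedy, standardizing the target as well (TransformedTargetRegressor and C=10 are illustrative choices, not from the original notebook):

from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardize the target before fitting and invert the transform on predict
svr_scaled = TransformedTargetRegressor(regressor=SVR(C=10), transformer=StandardScaler())
svr_scaled.fit(X_train, y_train)
print("Train R^2:", svr_scaled.score(X_train, y_train))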

[50]: # Evaluate Linear Regression
y_pred_test = regr.predict(X_test)
print("Linear Regression:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_test))
print("R-squared (R^2):", r2_score(y_test, y_pred_test))
print("\n")

# Evaluate Decision Tree
dec_tree_y_pred_test = dec_tree.predict(X_test)
print("Decision Tree:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, dec_tree_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, dec_tree_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, dec_tree_y_pred_test))
print("\n")

# Evaluate Random Forest
rf_tree_y_pred_test = rf_tree.predict(X_test)
print("Random Forest:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, rf_tree_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, rf_tree_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, rf_tree_y_pred_test))
print("\n")

# Evaluate Support Vector Regression
svr_y_pred_test = svr.predict(X_test)
print("Support Vector Regression:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, svr_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, svr_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, svr_y_pred_test))

Linear Regression:
Mean Absolute Error (MAE): 22.818438000992135
Mean Squared Error (MSE): 1090.7880443734675
R-squared (R^2): 0.9014501379286975

Decision Tree:
Mean Absolute Error (MAE): 16.18867924528302
Mean Squared Error (MSE): 1239.132075471698
R-squared (R^2): 0.8880476406431422

Random Forest:
Mean Absolute Error (MAE): 13.44433962264151
Mean Squared Error (MSE): 1073.5168018867928
R-squared (R^2): 0.9030105497553962

Support Vector Regression:
Mean Absolute Error (MAE): 44.35493694171242
Mean Squared Error (MSE): 10546.944208975867
R-squared (R^2): 0.04711102910436715

[51]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# a. Mean of Residuals
residuals = y_train - y_pred
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

# b. Check for Homoscedasticity
p = sns.scatterplot(x='y_pred', y='residuals', data=pd.DataFrame({'y_pred': y_pred, 'residuals': residuals}))
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-50, 50)
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='blue')
plt.title('Residuals vs fitted values plot for homoscedasticity check')
plt.show()

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# c. Check for Normality of error terms/residuals
p = sns.histplot(residuals, kde=True)  # distplot is deprecated in recent seaborn
p = plt.title('Normality of error terms/residuals')

# d. No autocorrelation of residuals
plt.figure(figsize=(10, 5))
sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='red')  # zero-residual reference line
plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()

result = diag.acorr_ljungbox(residuals, lags=40)
print(result)
sm.graphics.tsa.plot_acf(residuals, lags=40)
plt.show()
sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()

# e. No perfect multicollinearity
plt.figure(figsize=(20, 20))
p = sns.heatmap(ad_data.corr(), annot=True, cmap='RdYlGn', square=True)

Mean of Residuals -1.166018848426831e-14

lb_stat lb_pvalue
1 3.445976 0.063406
2 3.841507 0.146497
3 3.862897 0.276654
4 4.055939 0.398488
5 4.056045 0.541375
6 6.149900 0.406610
7 6.623357 0.469120
8 8.695414 0.368638
9 9.784872 0.368180
10 12.432321 0.257164
11 12.625875 0.318479
12 15.258457 0.227606
13 17.323003 0.184956
14 17.351213 0.237944
15 17.351245 0.298304
16 17.476452 0.355432
17 19.121962 0.321577
18 20.900772 0.284449
19 22.071802 0.280697
20 22.094391 0.335411
21 22.317215 0.381432
22 23.712936 0.362455
23 24.365714 0.383795
24 24.822061 0.415481
25 25.794042 0.418654
26 25.903830 0.468401
27 26.541119 0.488750
28 26.970155 0.519873
29 27.253867 0.558010
30 27.574405 0.592979
31 27.927626 0.624899
32 28.985610 0.619894
33 30.859367 0.574087
34 33.422901 0.495736
35 33.475086 0.541772
36 33.483268 0.588883
37 33.485034 0.634610
38 33.513072 0.676927
39 33.911212 0.700877
40 42.353219 0.369803

[52]: import matplotlib.pyplot as plt
# Calculate correlations with the target variable
correlations = ad_data.corr()['Y'].sort_values(ascending=False)
print(correlations)

# Plot correlations
plt.figure(figsize=(10, 6))
correlations.drop('Y').plot(kind='bar')
plt.title('Correlation with Target Variable (Y)')
plt.xlabel('Independent Variables')
plt.ylabel('Correlation Coefficient')
plt.show()

Y 1.000000
X7 0.966472
X3 0.901202
X2 0.819292
X4 0.648620
X4.1 0.610580
X6 0.592156
X1 -0.288396
Name: Y, dtype: float64

[54]: import matplotlib.pyplot as plt
import numpy as np

# Linearity check: one regression plot per feature against the target
for col in x.columns:
    plt.figure()
    sns.regplot(x=x[col], y=y, scatter_kws={'alpha': 0.5})
    plt.title(f"Linearity Check: {col} vs Y")
    plt.show()

# 5-fold cross-validation of the linear model
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regr, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print("Cross-validated RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

Cross-validated RMSE scores: [70.69045924 19.30174199 35.34990181 33.80403957
45.35382374]
Average RMSE: 40.89999326980918
