21brs1474 ML Lab 2


July 26, 2024

[6]: import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set(context="notebook", palette="Spectral", style='darkgrid', font_scale=1.5, color_codes=True)

import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt

ad_data = pd.read_csv('/content/Regression3.csv')

ad_data.info()
ad_data.describe()

p = sns.pairplot(ad_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 X1 209 non-null int64
1 X2 209 non-null int64
2 X3 209 non-null int64
3 X4 209 non-null int64
4 X4.1 209 non-null int64
5 X6 209 non-null int64
6 X7 209 non-null int64
7 Y 209 non-null int64
dtypes: int64(8)
memory usage: 13.2 KB

[8]: p = sns.pairplot(ad_data, x_vars=['X1','X2','X3','X4','X4.1','X6','X7'], y_vars='Y', height=7, aspect=0.7)  # 'size' was renamed 'height' in seaborn

[12]: x = ad_data.drop(["Y"],axis=1)
y = ad_data.Y

[13]: from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(x)  # note: fitted on the full dataset; see the leakage note below

[14]: from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
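Note that the scaler in cell [13] was fit on the full feature matrix before splitting, which leaks test-set statistics into the preprocessing. A minimal leakage-free sketch (the _alt names are illustrative, not part of the original notebook):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then fit the scaler on the training rows only
x_train_alt, x_test_alt, y_train_alt, y_test_alt = train_test_split(x, y, random_state=0, test_size=0.25)
sc_alt = StandardScaler()
X_train_alt = sc_alt.fit_transform(x_train_alt)  # statistics estimated from training rows only
X_test_alt = sc_alt.transform(x_test_alt)        # test rows transformed with training statistics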

[15]: from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_train)  # predictions on the training set

[16]: print("R squared: {}".format(r2_score(y_true=y_train,y_pred=y_pred)))

R squared: 0.9657892242175452

[17]: residuals = y_train.values - y_pred
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

Mean of Residuals -1.166018848426831e-14

[26]: df = pd.DataFrame({'y_pred': y_pred, 'residuals': residuals})  # df must be built before plotting
p = sns.scatterplot(x='y_pred', y='residuals', data=df)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-50, 50)
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='blue')  # zero-residual reference line
plt.title('Residuals vs fitted values plot for homoscedasticity check')
plt.show()

[27]: import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

[27]: [('F statistic', 0.8775955315176062), ('p-value', 0.7081442994879754)]
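The Goldfeld-Quandt p-value (~0.71) gives no evidence against homoscedasticity. As a cross-check, the Breusch-Pagan test could be applied to the same residuals; a minimal sketch, assuming residuals and X_train as defined above:

import statsmodels.stats.api as sms
import statsmodels.api as sm

# Breusch-Pagan regresses squared residuals on the regressors;
# a small p-value would indicate heteroscedasticity.
bp_names = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
bp_test = sms.het_breuschpagan(residuals, sm.add_constant(X_train))
print(dict(zip(bp_names, bp_test)))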

[30]: from scipy.stats import bartlett

# Flatten X_train to a 1-D array (no need for .values); note that Bartlett's
# test compares group variances, so this is only a coarse check here.
test = bartlett(X_train.ravel(), residuals)
print(test)

BartlettResult(statistic=4779.093717613901, pvalue=0.0)
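Bartlett's test assumes normal data, and comparing flattened features against residuals is a coarse use of it; Levene's test is a more robust variant of the same comparison. A minimal sketch, reusing the arrays above:

from scipy.stats import levene

# Levene's test uses absolute deviations from the median by default,
# making it less sensitive to non-normality than Bartlett's test.
lev = levene(X_train.ravel(), residuals)
print(lev)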

[31]: p = sns.histplot(residuals, kde=True)  # distplot is deprecated in recent seaborn
p = plt.title('Normality of error terms/residuals')

[38]: plt.figure(figsize=(10, 5))

# Pass x and y as keyword arguments; the variable is named line_plot so it
# does not shadow the plot function.
line_plot = sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')

plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')

# Set the x-axis limits before drawing the horizontal reference line
plt.xlim(0, 26)

# Draw the zero-residual reference line with plt.plot (a dict passed to
# sns.lineplot without x/y would be treated as wide-form data and misplot)
plt.plot([0, 26], [0, 0], color='red')

plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()

[42]: from statsmodels.stats import diagnostic as diag

result = diag.acorr_ljungbox(residuals, lags=40)
if len(result.columns) > 1:  # acorr_ljungbox returns a DataFrame in recent statsmodels
    min_p_value = result['lb_pvalue'].min()  # smallest p-value across all lags
    print(min_p_value)
else:
    print("The result of acorr_ljungbox does not have a second element.")

0.0634059439491035
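The smallest Ljung-Box p-value across 40 lags is about 0.063, so there is no strong evidence of autocorrelation at the 5% level. The Durbin-Watson statistic offers a quick complementary first-order check; a minimal sketch, assuming residuals from above:

from statsmodels.stats.stattools import durbin_watson

# Values near 2 indicate little first-order autocorrelation;
# values toward 0 or 4 indicate positive or negative autocorrelation.
dw = durbin_watson(residuals)
print("Durbin-Watson statistic:", dw)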

[43]: import statsmodels.api as sm

[44]: sm.graphics.tsa.plot_acf(residuals, lags=40)
plt.show()

[45]: sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()

[46]: plt.figure(figsize=(20, 20))  # set the figure size to 20 by 20
p = sns.heatmap(ad_data.corr(), annot=True, cmap='RdYlGn', square=True)  # seaborn has a very simple solution for heatmaps
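The heatmap shows pairwise correlations only; variance inflation factors quantify multicollinearity against all other regressors at once. A minimal sketch, assuming the unscaled feature frame x from cell [12]:

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# VIF > 10 is a common rule of thumb for problematic multicollinearity.
X_vif = sm.add_constant(x)
for i, col in enumerate(X_vif.columns):
    if col != 'const':
        print(col, variance_inflation_factor(X_vif.values, i))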

[47]: from sklearn.tree import DecisionTreeRegressor

dec_tree = DecisionTreeRegressor(random_state=0)
dec_tree.fit(X_train, y_train)
dec_tree_y_pred = dec_tree.predict(X_train)
# Note: for regressors, .score() returns R-squared, not classification accuracy
print("Accuracy: {}".format(dec_tree.score(X_train, y_train)))
print("R squared: {}".format(r2_score(y_true=y_train, y_pred=dec_tree_y_pred)))

Accuracy: 1.0
R squared: 1.0
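A training R-squared of exactly 1.0 means the unpruned tree memorized the training set; the held-out metrics in cell [50] below are the meaningful ones. One possible way to rein in the overfitting is depth limiting; a minimal sketch (max_depth=5 is an illustrative, untuned choice):

from sklearn.tree import DecisionTreeRegressor

# A depth-limited tree trades training fit for better generalization
shallow_tree = DecisionTreeRegressor(max_depth=5, random_state=0)
shallow_tree.fit(X_train, y_train)
print("Train R^2:", shallow_tree.score(X_train, y_train))
print("Test R^2:", shallow_tree.score(X_test, y_test))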

[48]: from sklearn.ensemble import RandomForestRegressor

rf_tree = RandomForestRegressor(random_state=0)
rf_tree.fit(X_train,y_train)
rf_tree_y_pred = rf_tree.predict(X_train)
print("Accuracy: {}".format(rf_tree.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=rf_tree_y_pred)))

Accuracy: 0.9879262687432366
R squared: 0.9879262687432366

[49]: from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train,y_train)
svr_y_pred = svr.predict(X_train)
print("Accuracy: {}".format(svr.score(X_train,y_train)))
print("R squared: {}".format(r2_score(y_true=y_train,y_pred=svr_y_pred)))

Accuracy: -0.01607994872969165
R squared: -0.01607994872969165
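A negative training R-squared suggests the default SVR settings fit worse than predicting the mean, likely because the target is left unscaled while the features are standardized. A minimal sketch of one remedy, standardizing the target as well (TransformedTargetRegressor and C=10 are illustrative choices, not from the original notebook):

from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardize the target before fitting and invert the transform on predict
svr_scaled = TransformedTargetRegressor(regressor=SVR(C=10), transformer=StandardScaler())
svr_scaled.fit(X_train, y_train)
print("Train R^2:", svr_scaled.score(X_train, y_train))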

[50]: # Evaluate Linear Regression
y_pred_test = regr.predict(X_test)
print("Linear Regression:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_test))
print("R-squared (R^2):", r2_score(y_test, y_pred_test))
print("\n")

# Evaluate Decision Tree
dec_tree_y_pred_test = dec_tree.predict(X_test)
print("Decision Tree:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, dec_tree_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, dec_tree_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, dec_tree_y_pred_test))
print("\n")

# Evaluate Random Forest
rf_tree_y_pred_test = rf_tree.predict(X_test)
print("Random Forest:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, rf_tree_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, rf_tree_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, rf_tree_y_pred_test))
print("\n")

# Evaluate Support Vector Regression
svr_y_pred_test = svr.predict(X_test)
print("Support Vector Regression:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, svr_y_pred_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, svr_y_pred_test))
print("R-squared (R^2):", r2_score(y_test, svr_y_pred_test))

Linear Regression:
Mean Absolute Error (MAE): 22.818438000992135
Mean Squared Error (MSE): 1090.7880443734675
R-squared (R^2): 0.9014501379286975

Decision Tree:
Mean Absolute Error (MAE): 16.18867924528302
Mean Squared Error (MSE): 1239.132075471698
R-squared (R^2): 0.8880476406431422

Random Forest:
Mean Absolute Error (MAE): 13.44433962264151
Mean Squared Error (MSE): 1073.5168018867928
R-squared (R^2): 0.9030105497553962

Support Vector Regression:
Mean Absolute Error (MAE): 44.35493694171242
Mean Squared Error (MSE): 10546.944208975867
R-squared (R^2): 0.04711102910436715

[51]: import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# a. Mean of Residuals
residuals = y_train - y_pred
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

# b. Check for Homoscedasticity
p = sns.scatterplot(x='y_pred', y='residuals', data=pd.DataFrame({'y_pred': y_pred, 'residuals': residuals}))
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-50, 50)
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='blue')
plt.title('Residuals vs fitted values plot for homoscedasticity check')
plt.show()

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# c. Check for Normality of error terms/residuals
p = sns.histplot(residuals, kde=True)  # distplot is deprecated in recent seaborn
p = plt.title('Normality of error terms/residuals')

# d. No autocorrelation of residuals
plt.figure(figsize=(10, 5))
sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.xlim(0, 26)
plt.plot([0, 26], [0, 0], color='red')  # zero-residual reference line
plt.ylim(-50, 50)
plt.title('Residuals vs fitted values plot for autocorrelation check')
plt.show()

result = diag.acorr_ljungbox(residuals, lags=40)
print(result)
sm.graphics.tsa.plot_acf(residuals, lags=40)
plt.show()
sm.graphics.tsa.plot_pacf(residuals, lags=40)
plt.show()

# e. No perfect multicollinearity
plt.figure(figsize=(20, 20))
p = sns.heatmap(ad_data.corr(), annot=True, cmap='RdYlGn', square=True)

Mean of Residuals -1.166018848426831e-14

lb_stat lb_pvalue
1 3.445976 0.063406
2 3.841507 0.146497
3 3.862897 0.276654
4 4.055939 0.398488
5 4.056045 0.541375
6 6.149900 0.406610
7 6.623357 0.469120
8 8.695414 0.368638
9 9.784872 0.368180
10 12.432321 0.257164
11 12.625875 0.318479
12 15.258457 0.227606
13 17.323003 0.184956
14 17.351213 0.237944
15 17.351245 0.298304
16 17.476452 0.355432
17 19.121962 0.321577
18 20.900772 0.284449
19 22.071802 0.280697
20 22.094391 0.335411
21 22.317215 0.381432
22 23.712936 0.362455
23 24.365714 0.383795
24 24.822061 0.415481
25 25.794042 0.418654
26 25.903830 0.468401
27 26.541119 0.488750
28 26.970155 0.519873
29 27.253867 0.558010
30 27.574405 0.592979
31 27.927626 0.624899
32 28.985610 0.619894
33 30.859367 0.574087
34 33.422901 0.495736
35 33.475086 0.541772
36 33.483268 0.588883
37 33.485034 0.634610
38 33.513072 0.676927
39 33.911212 0.700877
40 42.353219 0.369803

[52]: import matplotlib.pyplot as plt
# Calculate correlations with the target variable
correlations = ad_data.corr()['Y'].sort_values(ascending=False)
print(correlations)

# Plot correlations
plt.figure(figsize=(10, 6))
correlations.drop('Y').plot(kind='bar')
plt.title('Correlation with Target Variable (Y)')
plt.xlabel('Independent Variables')
plt.ylabel('Correlation Coefficient')
plt.show()

Y 1.000000
X7 0.966472
X3 0.901202
X2 0.819292
X4 0.648620
X4.1 0.610580
X6 0.592156
X1 -0.288396
Name: Y, dtype: float64

[54]: import matplotlib.pyplot as plt
import numpy as np

# Linearity check: one regression plot per feature against the target
for col in x.columns:
    plt.figure()
    sns.regplot(x=x[col], y=y, scatter_kws={'alpha': 0.5})
    plt.title(f"Linearity Check: {col} vs Y")
    plt.show()

# 5-fold cross-validation of the linear model
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regr, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print("Cross-validated RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

Cross-validated RMSE scores: [70.69045924 19.30174199 35.34990181 33.80403957
45.35382374]
Average RMSE: 40.89999326980918
