You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hello everyone, I am using a walk-forward validation technique with RF to identify the main drivers of GWL fluctuation. My problem is that the R² value for testing comes out negative, which probably indicates very poor performance. I understand that walk-forward validation is a good technique for time-series data. I do not know whether I am doing something wrong or whether this validation method simply does not suit the data. I am using one year of data (365 days) to train and 100 days to test; I first tried 30 days of data for testing. See my code below; maybe I'm doing something wrong. I would appreciate your feedback.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load the dataset
# set_index returns a new frame, so LTF15_dff itself is left unmodified.
data = LTF15_dff.set_index('Date')

# Largest offset at which a full test window still fits after the training rows.
end = len(data) - train_window - test_window

# Expanding-window walkforward validation.
# The training set always starts at row 0 and grows by one row per fold; the
# test set is the `test_window` rows that follow it, so the test windows of
# consecutive folds overlap. range(end + 1) includes the final fold, whose
# test window finishes on the last row of `data`; range(end) stopped one fold
# short and ran no fold at all when len(data) == train_window + test_window.
for start in range(end + 1):
    # Define training and testing data
    split = start + train_window
    train = data.iloc[:split]
    test = data.iloc[split:split + test_window]

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    # Train Random Forest Regressor with manually tuned hyperparameters.
    # random_state is fixed so every fold is reproducible.
    model = RandomForestRegressor(
        n_estimators=manual_params['n_estimators'],
        max_depth=manual_params['max_depth'],
        min_samples_split=manual_params['min_samples_split'],
        min_samples_leaf=manual_params['min_samples_leaf'],
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Store feature importances for this fold
    feature_importances.append(model.feature_importances_)

    # In-sample performance (predictions on the rows the model was fitted on)
    y_train_pred = model.predict(X_train)
    metrics['Train_RMSE'].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    metrics['Train_MAE'].append(mean_absolute_error(y_train, y_train_pred))
    metrics['Train_R2'].append(r2_score(y_train, y_train_pred))

    # Out-of-sample performance on the test window.
    # NOTE: r2_score measures the model against predicting the mean of y_test
    # for this window, so it is negative whenever the model's squared error
    # exceeds the variance of the target inside the window.
    y_pred = model.predict(X_test)
    metrics['Test_RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    metrics['Test_MAE'].append(mean_absolute_error(y_test, y_pred))
    metrics['Test_R2'].append(r2_score(y_test, y_pred))
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
Hello everyone, I am using a walk-forward validation technique with RF to identify the main drivers of GWL fluctuation. My problem is that the R² value for testing comes out negative, which probably indicates very poor performance. I understand that walk-forward validation is a good technique for time-series data. I do not know whether I am doing something wrong or whether this validation method simply does not suit the data. I am using one year of data (365 days) to train and 100 days to test; I first tried 30 days of data for testing. See my code below; maybe I'm doing something wrong. I would appreciate your feedback.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load the dataset
# set_index returns a new frame, so LTF15_dff itself is left unmodified.
data = LTF15_dff.set_index('Date')

features = ['SM', 'ST', 'AT', 'SWE', 'P', 'LLTC_SWL', 'ULTC_SWL']
target = 'H'

# Walkforward validation setup
train_window = 365  # rows in the initial training window
test_window = 100   # rows in each test window
metrics = {
    'Train_RMSE': [], 'Train_MAE': [], 'Train_R2': [],
    'Test_RMSE': [], 'Test_MAE': [], 'Test_R2': [],
}
feature_importances = []

# Normalize the features.
# The scaler is fitted on the first `train_window` rows only: those rows are
# part of every fold's training set and never part of a test window, so no
# statistics from test rows leak into the scaled features.
scaler = StandardScaler()
scaler.fit(data[features].iloc[:train_window])
data[features] = scaler.transform(data[features])

# Manual hyperparameter configuration
manual_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
}

# Largest offset at which a full test window still fits after the training rows.
end = len(data) - train_window - test_window

# Expanding-window walkforward validation.
# range(end + 1) includes the final fold, whose test window finishes on the
# last row of `data`; range(end) stopped one fold short.
for start in range(end + 1):
    # Define training and testing data
    split = start + train_window
    train = data.iloc[:split]
    test = data.iloc[split:split + test_window]
Summarize metrics
# Summarize metrics: each value is the mean of the per-fold scores.
Train_average_rmse = np.mean(metrics['Train_RMSE'])
Train_average_mae = np.mean(metrics['Train_MAE'])
Train_average_r2 = np.mean(metrics['Train_R2'])
print(f"Train ave. RMSE: {Train_average_rmse:.3f}")
print(f"Train ave. MAE: {Train_average_mae:.3f}")
print(f"Train ave. R²: {Train_average_r2:.3f}")

average_rmse = np.mean(metrics['Test_RMSE'])
average_mae = np.mean(metrics['Test_MAE'])
average_r2 = np.mean(metrics['Test_R2'])
print("-----------------------------------")
print(f"Test ave. RMSE: {average_rmse:.3f}")
print(f"Test ave. MAE: {average_mae:.3f}")
print(f"Test ave. R²: {average_r2:.3f}")

# Aggregate and analyze feature importance (mean over folds, per feature)
avg_feature_importances = np.mean(feature_importances, axis=0)
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': avg_feature_importances,
}).sort_values(by='Importance', ascending=False)

# Display overall feature importance
print("\nOverall Feature Importance:")
print(feature_importance_df)

# Temporal variation of feature importance: one row per fold.
feature_importances_df = pd.DataFrame(feature_importances, columns=features)
# Label fold k with the index entry at position train_window + k, i.e. the
# first row of that fold's test window. The index must be a sequence of the
# same length as the frame; assigning the single label data.index[train_window]
# raises TypeError.
feature_importances_df.index = data.index[
    train_window:train_window + len(feature_importances_df)
]

# Plot temporal feature importance
plt.figure(figsize=(12, 8))
for feature in features:
    plt.plot(feature_importances_df[feature], label=feature)
plt.title('Temporal Variation of Feature Importance')
plt.xlabel('Time')
plt.ylabel('Feature Importance')
plt.legend()
plt.show()
Beta Was this translation helpful? Give feedback.
All reactions