Commit ca6f4c1

Loop train and test/evaluate over 30 different contiguous lookup date values (1-30). Put results in a CSV file.
1 parent 1d88bb1 commit ca6f4c1

3 files changed: 201 additions & 3 deletions

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
import os

import numpy as np

import matplotlib.pyplot as plt

from stock_prediction import create_model, load_data
from parameters import *


def plot_graph(test_df, MY_LOOKUP_STEP):
    """
    This function plots the true close price along with the predicted close price,
    in black and blue respectively
    """
    plt.plot(test_df[f'true_adjclose_{MY_LOOKUP_STEP}'], c='k')
    plt.plot(test_df[f'adjclose_{MY_LOOKUP_STEP}'], c='b')
    plt.xlabel("Days")
    plt.ylabel("Price")
    plt.legend(["Actual Price", "Predicted Price"])
    plt.show()


def get_final_df(model, data, MY_LOOKUP_STEP):
    """
    This function takes the `model` and `data` dict to
    construct a final dataframe that includes the features along
    with the true and predicted prices of the testing dataset
    """
    # if the predicted future price is higher than the current price,
    # then calculate the true future price minus the current price, to get the buy profit
    buy_profit = lambda current, pred_future, true_future: true_future - current if pred_future > current else 0
    # if the predicted future price is lower than the current price,
    # then subtract the true future price from the current price, to get the sell profit
    sell_profit = lambda current, pred_future, true_future: current - true_future if pred_future < current else 0
    X_test = data["X_test"]
    y_test = data["y_test"]
    # perform prediction and get prices
    y_pred = model.predict(X_test)
    if SCALE:
        y_test = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(np.expand_dims(y_test, axis=0)))
        y_pred = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(y_pred))
    test_df = data["test_df"]
    # add predicted future prices to the dataframe
    test_df[f"adjclose_{MY_LOOKUP_STEP}"] = y_pred
    # add true future prices to the dataframe
    test_df[f"true_adjclose_{MY_LOOKUP_STEP}"] = y_test
    # sort the dataframe by date
    test_df.sort_index(inplace=True)
    final_df = test_df
    # add the buy profit column
    final_df["buy_profit"] = list(map(buy_profit,
                                      final_df["adjclose"],
                                      final_df[f"adjclose_{MY_LOOKUP_STEP}"],
                                      final_df[f"true_adjclose_{MY_LOOKUP_STEP}"])
                                  # since we don't have profit for the last sequence, add 0's
                                  )
    # add the sell profit column
    final_df["sell_profit"] = list(map(sell_profit,
                                       final_df["adjclose"],
                                       final_df[f"adjclose_{MY_LOOKUP_STEP}"],
                                       final_df[f"true_adjclose_{MY_LOOKUP_STEP}"])
                                   # since we don't have profit for the last sequence, add 0's
                                   )
    return final_df


def predict(model, data):
    # retrieve the last sequence from the data
    last_sequence = data["last_sequence"][-N_STEPS:]
    # expand dimension
    last_sequence = np.expand_dims(last_sequence, axis=0)
    # get the prediction (scaled from 0 to 1)
    prediction = model.predict(last_sequence)
    # get the price (by inverting the scaling)
    if SCALE:
        predicted_price = data["column_scaler"]["adjclose"].inverse_transform(prediction)[0][0]
    else:
        predicted_price = prediction[0][0]
    return predicted_price


predictions = []
# predictions = np.array(float)
# predictions.resize(30)
for MY_LOOKUP_STEP in range(1, 30):

    # load the data
    data = load_data(ticker, N_STEPS, scale=SCALE, split_by_date=SPLIT_BY_DATE,
                     shuffle=SHUFFLE, lookup_step=MY_LOOKUP_STEP, test_size=TEST_SIZE,
                     feature_columns=FEATURE_COLUMNS)

    # construct the model
    model = create_model(N_STEPS, len(FEATURE_COLUMNS), loss=LOSS, units=UNITS, cell=CELL, n_layers=N_LAYERS,
                         dropout=DROPOUT, optimizer=OPTIMIZER, bidirectional=BIDIRECTIONAL)

    my_model_name = f"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-\
{LOSS}-{OPTIMIZER}-{CELL.__name__}-seq-{N_STEPS}-step-{MY_LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}"
    if BIDIRECTIONAL:
        my_model_name += "-b"

    # load the optimal model weights from the results folder
    model_path = os.path.join("results", my_model_name) + ".h5"
    model.load_weights(model_path)

    # evaluate the model
    loss, mae = model.evaluate(data["X_test"], data["y_test"], verbose=0)
    # calculate the mean absolute error (inverse scaling)
    if SCALE:
        mean_absolute_error = data["column_scaler"]["adjclose"].inverse_transform([[mae]])[0][0]
    else:
        mean_absolute_error = mae

    # get the final dataframe for the testing set
    final_df = get_final_df(model, data, MY_LOOKUP_STEP)
    # predict the future price
    future_price = predict(model, data)
    # we calculate the accuracy by counting the number of positive profits
    accuracy_score = (len(final_df[final_df['sell_profit'] > 0]) + len(final_df[final_df['buy_profit'] > 0])) / len(final_df)
    # calculating total buy & sell profit
    total_buy_profit = final_df["buy_profit"].sum()
    total_sell_profit = final_df["sell_profit"].sum()
    # total profit by adding sell & buy together
    total_profit = total_buy_profit + total_sell_profit
    # dividing total profit by number of testing samples (number of trades)
    profit_per_trade = total_profit / len(final_df)
    # append the new prediction to the list
    # predictions[MY_LOOKUP_STEP] = future_price
    predictions.append(future_price)
    # printing metrics
    print(f"Future price after {MY_LOOKUP_STEP} days is {future_price:.2f}$")
    print(f"{LOSS} loss:", loss)
    print("Mean Absolute Error:", mean_absolute_error)
    print("Accuracy score:", accuracy_score)
    print("Total buy profit:", total_buy_profit)
    print("Total sell profit:", total_sell_profit)
    print("Total profit:", total_profit)
    print("Profit per trade:", profit_per_trade)
    # plot the true/predicted prices graph
    plot_graph(final_df, MY_LOOKUP_STEP)
    print(final_df.tail(10))
    print(f"Future price after {MY_LOOKUP_STEP} days is {future_price:.2f}$")
    # save the final dataframe to the csv-results folder
    csv_results_folder = "csv-results"
    if not os.path.isdir(csv_results_folder):
        os.mkdir(csv_results_folder)
    csv_filename = os.path.join(csv_results_folder, my_model_name + ".csv")
    final_df.to_csv(csv_filename)

# after the loop, save all future-price predictions (one per lookup step) as a flat CSV
print(predictions)
predictions = np.array(predictions)
predictions.tofile(os.path.join(csv_results_folder, f"evaluate30_{date_now}_{ticker}.csv"), ",")
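
Because `ndarray.tofile(..., ",")` writes a flat, headerless comma-separated file, the per-step predictions can be read back with `np.fromfile`. A minimal reader sketch (not part of this commit; it assumes it runs on the same date, since `date_now` from parameters is embedded in the filename):

import os

import numpy as np

from parameters import *

# read back the flat comma-separated predictions written by the evaluation loop above
preds = np.fromfile(os.path.join("csv-results", f"evaluate30_{date_now}_{ticker}.csv"), sep=",")
# positions map to lookup steps 1, 2, ... in loop order
for step, price in enumerate(preds, start=1):
    print(f"lookup step {step}: predicted price {price:.2f}")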

machine-learning/stock-prediction/parameters.py

Lines changed: 5 additions & 3 deletions
@@ -5,10 +5,10 @@
 # Window size or the sequence length
 N_STEPS = 50
 # Lookup step, 1 is the next day
-LOOKUP_STEP = 15
+LOOKUP_STEP = 1 #15

 # whether to scale feature columns & output price as well
-SCALE = True
+SCALE = True #False #True
 scale_str = f"sc-{int(SCALE)}"
 # whether to shuffle the dataset
 SHUFFLE = True
@@ -46,7 +46,9 @@
 EPOCHS = 500

 # Amazon stock market
-ticker = "AMZN"
+# ticker = "AMZN"
+# Oxford Nanopore stock market
+ticker = "ONTTF"
 ticker_data_filename = os.path.join("data", f"{ticker}_{date_now}.csv")
 # model name to save, making it as unique as possible based on parameters
 model_name = f"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-\
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from stock_prediction import create_model, load_data
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
import os
import pandas as pd
from parameters import *


# create these folders if they do not exist
if not os.path.isdir("results"):
    os.mkdir("results")

if not os.path.isdir("logs"):
    os.mkdir("logs")

if not os.path.isdir("data"):
    os.mkdir("data")

# train one model for each lookup step
for MY_LOOKUP_STEP in range(1, 30):

    # load the data
    data = load_data(ticker, N_STEPS, scale=SCALE, split_by_date=SPLIT_BY_DATE,
                     shuffle=SHUFFLE, lookup_step=MY_LOOKUP_STEP, test_size=TEST_SIZE,
                     feature_columns=FEATURE_COLUMNS)

    # save the dataframe
    data["df"].to_csv(ticker_data_filename)

    # construct the model
    model = create_model(N_STEPS, len(FEATURE_COLUMNS), loss=LOSS, units=UNITS, cell=CELL, n_layers=N_LAYERS,
                         dropout=DROPOUT, optimizer=OPTIMIZER, bidirectional=BIDIRECTIONAL)

    my_model_name = f"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-\
{LOSS}-{OPTIMIZER}-{CELL.__name__}-seq-{N_STEPS}-step-{MY_LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}"
    if BIDIRECTIONAL:
        my_model_name += "-b"

    # some tensorflow callbacks
    checkpointer = ModelCheckpoint(os.path.join("results", my_model_name + ".h5"), save_weights_only=True, save_best_only=True, verbose=1)
    tensorboard = TensorBoard(log_dir=os.path.join("logs", my_model_name))
    # train the model and save the weights whenever we see
    # a new optimal model using ModelCheckpoint
    history = model.fit(data["X_train"], data["y_train"],
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        validation_data=(data["X_test"], data["y_test"]),
                        callbacks=[checkpointer, tensorboard],
                        verbose=1)
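
The evaluation script above reloads weights by rebuilding the same `my_model_name` for each `MY_LOOKUP_STEP`, so the training and evaluation loops must use the same range and parameters, and (since `date_now` is part of the name) run on the same date. A hypothetical pre-flight check along those lines, not part of this commit:

import os

from parameters import *

# verify that a checkpoint exists for every lookup step before evaluating
missing = []
for MY_LOOKUP_STEP in range(1, 30):
    name = f"{date_now}_{ticker}-{shuffle_str}-{scale_str}-{split_by_date_str}-\
{LOSS}-{OPTIMIZER}-{CELL.__name__}-seq-{N_STEPS}-step-{MY_LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}"
    if BIDIRECTIONAL:
        name += "-b"
    if not os.path.isfile(os.path.join("results", name + ".h5")):
        missing.append(MY_LOOKUP_STEP)
print("Missing checkpoints for lookup steps:", missing)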
