diff --git a/.gitignore b/.gitignore
index 58391e9..29fb4f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,9 @@ secrets/
 **/__pycache__/
 codecompasslib/API/datasets/**.csv
 codecompasslib/API/**.txt
+codecompasslib/embeddings/**.csv
 dataset_new.csv
 codecompasslib/models/**.csv
 codecompasslib/models/examples/**.csv
-codecompasslib/PretrainedModels/
\ No newline at end of file
+codecompasslib/PretrainedModels/
+**.csv
\ No newline at end of file
diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py
index 4b04da0..dfab9c7 100644
--- a/codecompasslib/API/helper_functions.py
+++ b/codecompasslib/API/helper_functions.py
@@ -1,3 +1,5 @@
+import pickle
+import os
 from json import load
 from pandas import DataFrame
 from os.path import dirname
@@ -17,6 +19,33 @@ def save_to_csv(data: any, filename: str) -> None:
     df: DataFrame = DataFrame(data)
     df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False)
 
+def save_cache(cache_data: dict, cache_filename: str) -> None:
+    """
+    Save a dictionary to a file in pickle format.
+
+    :param cache_data: The dictionary to be saved.
+    :param cache_filename: The name of the file where the cache will be saved.
+    """
+    with open(cache_filename, 'wb') as cache_file:
+        pickle.dump(cache_data, cache_file)
+    print(f"Cache saved to {cache_filename}")
+
+def load_cache(cache_filename: str) -> dict:
+    """
+    Load a dictionary from a pickle file.
+
+    :param cache_filename: The name of the file where the cache is stored.
+    :return: The loaded dictionary, or an empty dict if no cache file exists.
+    """
+    if os.path.exists(cache_filename):
+        with open(cache_filename, 'rb') as cache_file:
+            cache_data = pickle.load(cache_file)
+        print(f"Cache loaded from {cache_filename}")
+        return cache_data
+    else:
+        print(f"No cache found at {cache_filename}")
+        return {}
+
 def list_to_txt(data: list, file_name: str) -> bool:
     """
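The new `save_cache`/`load_cache` pair is a plain pickle round-trip over a dictionary. A minimal usage sketch (the cache path here is hypothetical, for illustration only; note that unpickling executes code embedded in the file, so only load cache files your own code wrote):

```python
from codecompasslib.API.helper_functions import save_cache, load_cache

CACHE_PATH = 'example_cache.pkl'  # hypothetical path, not part of the diff

cache = load_cache(CACHE_PATH)    # returns {} on the first run, before the file exists
cache['some_user'] = [('repo_id', 'repo_owner', 0.93)]
save_cache(cache, CACHE_PATH)     # pickle.dump the whole dict back to disk

assert load_cache(CACHE_PATH)['some_user'][0][2] == 0.93
```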
diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py
index 8a13a2f..3471f3f 100644
--- a/codecompasslib/embeddings/embeddings_helper_functions.py
+++ b/codecompasslib/embeddings/embeddings_helper_functions.py
@@ -1,3 +1,14 @@
+import sys
+import os
+
+# Resolve the repository root: two directory levels above this file's directory
+root_dir = os.path.dirname(os.path.abspath(__file__))
+project_dir = os.path.dirname(root_dir)
+real_project_dir = os.path.dirname(project_dir)
+
+# Add the repository root to the Python path so codecompasslib imports resolve
+sys.path.insert(0, real_project_dir)
+
 import numpy as np
 import pandas as pd
 from gensim.models.keyedvectors import KeyedVectors
@@ -35,7 +46,7 @@ def load_word2vec_model():
     Citation: Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo.
     https://doi.org/10.5281/zenodo.1199620
     """
-    word_vect = KeyedVectors.load_word2vec_format("./codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
+    word_vect = KeyedVectors.load_word2vec_format("codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
     return word_vect
 
 # Vectorizing text using domain specific word2vec model
diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py
index aaa05a7..ce8bd9a 100644
--- a/codecompasslib/embeddings/generate_embedded_dataset.py
+++ b/codecompasslib/embeddings/generate_embedded_dataset.py
@@ -90,13 +90,13 @@ def generate_openAI_embedded_csv(df, column_to_embed):
 
         # Save the current batch DataFrame to a CSV file
         # Mode 'a' is for append, header=False to avoid writing headers multiple times
-        batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False)
+        batch_df.to_csv('df_embedded_1208_batch.csv', mode='a', header=not i, index=False)
 
         # Optional: Free up memory by deleting the batch DataFrame if no longer needed
         del batch_df
 
     # Load the CSV file with the embeddings
-    df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv')
+    df_with_embeddings = pd.read_csv('df_embedded_1208_batch.csv')
     return df_with_embeddings
 
 def main():
diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py
index 9dafd97..e06d71c 100644
--- a/codecompasslib/models/cosine_similarity_model.py
+++ b/codecompasslib/models/cosine_similarity_model.py
@@ -39,6 +39,7 @@ def load_data(full_data_folder_id: str) -> DataFrame:
     """
     creds = get_creds_drive()
     df: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+    return df
 
 def clean_data(df: DataFrame) -> DataFrame:
diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb
new file mode 100644
index 0000000..9619501
--- /dev/null
+++ b/codecompasslib/models/examples/knn_model.ipynb
@@ -0,0 +1,414 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "import pandas as pd\n",
+    "from typing import Tuple\n",
+    "from pandas import DataFrame\n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
+    "\n",
+    "sys.path.append('../../../')\n",
+    "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n",
+    "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Word2Vec\n",
+    "Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning from the contexts in which they appear.\n",
+    "\n",
+    "Here’s how it works:\n",
+    "\n",
+    "1. Initialize a vector for each word randomly.\n",
+    "2. For each word in the corpus, predict the context words (words nearby) given the target word (skip-gram).\n",
+    "3. Adjust the word vectors to minimize the prediction error.\n",
+    "\n",
+    "The learned vectors represent the words’ meanings and can be used for tasks like document similarity, text classification, and information retrieval.\n",
+    "\n",
+    "In our project we use a pre-trained word2vec model trained specifically on the software-engineering domain (a minimal sketch of the vector-averaging step we rely on appears after the data-preparation cells below)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### K-Nearest Neighbors (KNN)\n",
+    "KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.\n",
+    "\n",
+    "- Distance metric: to measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, and cosine similarity.\n",
+    "- Prediction: given a new data point, find its K nearest neighbors based on the chosen distance metric.\n",
+    "\n",
+    "In our project we use KNN to find the users most similar to our target user."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Download 11%.\n",
+      "\n",
+      "Download 23%.\n",
+      "\n",
+      "Download 35%.\n",
+      "\n",
+      "Download 47%.\n",
+      "\n",
+      "Download 59%.\n",
+      "\n",
+      "Download 71%.\n",
+      "\n",
+      "Download 83%.\n",
+      "\n",
+      "Download 95%.\n",
+      "\n",
+      "Download 100%.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\API\\drive_operations.py:88: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  return read_csv(fh)\n"
+     ]
+    }
+   ],
+   "source": [
+    "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n",
+    "\n",
+    "df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "usecols = ['owner_user', 'name', 'description', 'language']\n",
+    "# keep only these columns\n",
+    "df = df_non_embedded.copy()\n",
+    "df = df[usecols]"
+   ]
+  },
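As noted in the Word2Vec cell above, `vectorize_text` represents a text as the average of the vectors of its in-vocabulary words. Here is a minimal, self-contained sketch of that averaging idea using a tiny hand-built `KeyedVectors` as a stand-in for the real SO_vectors_200 model (the project's `vectorize_text` may differ in details):

```python
import numpy as np
from gensim.models import KeyedVectors

# Tiny stand-in vocabulary for illustration; the real model has 200-dim vectors.
kv = KeyedVectors(vector_size=4)
kv.add_vectors(["python", "django", "blog"],
               np.random.default_rng(0).normal(size=(3, 4)).astype(np.float32))

def mean_vector(text: str, word_vect: KeyedVectors) -> np.ndarray:
    """Average the vectors of in-vocabulary words; zero vector if none match."""
    words = [w for w in text.lower().split() if w in word_vect.key_to_index]
    if not words:
        return np.zeros(word_vect.vector_size, dtype=np.float32)
    return np.mean([word_vect[w] for w in words], axis=0)

print(mean_vector("A Django blog engine", kv))  # averages 'django' and 'blog'
```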
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
owner_usernamedescriptionlanguage
4Rameshwar0852Automation_ProjectAutomated Bash Script to automate log Backup g...Shell
18Rameshwar0852IKONCBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION...Python
19Rameshwar0852javamavensonarrgohelmk8No descriptionHTML
23Rameshwar0852node001files_repoJavaScript
24Rameshwar0852nodeandjsnode java script applicationJavaScript
...............
2583820pinaxpinax-bloga blog app for DjangoPython
2583821montyloungedjango-mingusa Django blog engine leveraging reusable apps ...JavaScript
2583822WuXianglongGeekBlogA full blog system based on DjangoJavaScript
2583823NARKOZhacker-scriptsBased on a true storyJavaScript
2583824matthewbdalydjango_tutorial_blog_ngThe source for the new version of my Django tu...Python
\n", + "

1524223 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " owner_user name \\\n", + "4 Rameshwar0852 Automation_Project \n", + "18 Rameshwar0852 IKON \n", + "19 Rameshwar0852 javamavensonarrgohelmk8 \n", + "23 Rameshwar0852 node001 \n", + "24 Rameshwar0852 nodeandjs \n", + "... ... ... \n", + "2583820 pinax pinax-blog \n", + "2583821 montylounge django-mingus \n", + "2583822 WuXianglong GeekBlog \n", + "2583823 NARKOZ hacker-scripts \n", + "2583824 matthewbdaly django_tutorial_blog_ng \n", + "\n", + " description language \n", + "4 Automated Bash Script to automate log Backup g... Shell \n", + "18 CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... Python \n", + "19 No description HTML \n", + "23 files_repo JavaScript \n", + "24 node java script application JavaScript \n", + "... ... ... \n", + "2583820 a blog app for Django Python \n", + "2583821 a Django blog engine leveraging reusable apps ... JavaScript \n", + "2583822 A full blog system based on Django JavaScript \n", + "2583823 Based on a true story JavaScript \n", + "2583824 The source for the new version of my Django tu... Python \n", + "\n", + "[1524223 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# convert language column to string type\n", + "df['language'] = df['language'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File not found.\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'vector_size'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43membedded_user_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: 
vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\series.py:4908\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4780\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4781\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4782\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4788\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4791\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4906\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4907\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4908\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4909\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4910\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4911\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4912\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4913\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4914\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4915\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default 
has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", + "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\models\\model_diff_repos.py:60\u001b[0m, in \u001b[0;36mvectorize_text\u001b[1;34m(text, word_vect)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvectorize_text\u001b[39m(text, word_vect):\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m vector_sum \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(\u001b[43mword_vect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector_size\u001b[49m) \u001b[38;5;66;03m# Initialize an array to store the sum of word vectors\u001b[39;00m\n\u001b[0;32m 61\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;66;03m# Initialize a count to keep track of the number of words found in the vocabulary\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n", + "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'vector_size'" + ] + } + ], + "source": [ + "# Create 
list of unique languages, each prefixed with '_'\n",
+    "languages = ['_' + language for language in df['language'].unique()]\n",
+    "\n",
+    "# One-hot encode the languages without adding a column prefix\n",
+    "df = pd.get_dummies(df, columns=['language'], prefix='')\n",
+    "\n",
+    "# Turn df into a user-specific df with owner_user as a unique identifier, appending descriptions and keeping 1 if a language is present in at least one repo\n",
+    "\n",
+    "# Create a dictionary for aggregation\n",
+    "aggregation_dict = {\n",
+    "    'name': lambda x: list(x),\n",
+    "    'description': lambda x: list(x)\n",
+    "}\n",
+    "\n",
+    "# Add columns for languages\n",
+    "for lang in languages:\n",
+    "    aggregation_dict[lang] = 'max'\n",
+    "\n",
+    "# Group by 'owner_user' and aggregate\n",
+    "user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()\n",
+    "\n",
+    "# Join each user's lists of names and descriptions into single strings\n",
+    "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+    "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+    "user_df.head()\n",
+    "\n",
+    "# Text preprocessing\n",
+    "embedded_user_df = user_df.copy()\n",
+    "embedded_user_df['name'] = user_df['name'].fillna('')\n",
+    "embedded_user_df['description'] = user_df['description'].fillna('')\n",
+    "\n",
+    "word2vec_model = load_word2vec_model()\n",
+    "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+    "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+    "embedded_user_df\n",
+    "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)"
+   ]
+  },
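The `AttributeError` recorded above is a symptom of `load_word2vec_model()` apparently returning `None` when the pre-trained .bin file is missing (hence the "File not found." message), which `vectorize_text` then dereferences. A defensive variant that fails fast instead — a sketch only, assuming the same model path the library uses; `load_word2vec_model_strict` is a hypothetical name, not part of the codebase:

```python
import os
from gensim.models.keyedvectors import KeyedVectors

MODEL_PATH = "codecompasslib/PretrainedModels/SO_vectors_200.bin"

def load_word2vec_model_strict(path: str = MODEL_PATH) -> KeyedVectors:
    """Load the pre-trained vectors, raising immediately if the file is absent."""
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"word2vec model not found at {path}; download SO_vectors_200.bin "
            "from https://doi.org/10.5281/zenodo.1199620 into PretrainedModels/"
        )
    return KeyedVectors.load_word2vec_format(path, binary=True)
```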
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Transform df into a feature matrix that KNN can use\n",
+    "# Create a list of all the vectors\n",
+    "vectors = []\n",
+    "repo_df = embedded_user_df * 1  # convert all boolean values to 0 or 1\n",
+    "\n",
+    "for row in repo_df.index:\n",
+    "    vector = []\n",
+    "    for column in ['name_vector', 'description_vector']:\n",
+    "        if isinstance(repo_df.at[row, column], np.ndarray):\n",
+    "            for element in repo_df.at[row, column]:\n",
+    "                vector.append(element)\n",
+    "        else:\n",
+    "            vector.append(repo_df.at[row, column])\n",
+    "    vectors.append(vector)\n",
+    "\n",
+    "# Train the Nearest Neighbors model\n",
+    "k = 5  # Number of neighbors to find\n",
+    "nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')\n",
+    "nn_model.fit(vectors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example usage\n",
+    "\n",
+    "target_user = 21  # positional index of the target user\n",
+    "# nearest neighbors, excluding the target user itself\n",
+    "neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]\n",
+    "neighbors"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py
index 9d21174..7d64b7f 100644
--- a/codecompasslib/models/lightgbm_model.py
+++ b/codecompasslib/models/lightgbm_model.py
@@ -19,7 +19,7 @@
 from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive
 from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos
-
+from codecompasslib.API.helper_functions import save_cache, load_cache
 
 def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]:
     """
@@ -38,7 +38,6 @@ def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl
         del df[label_col]
     return df, y
 
-
 def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]:
     """
     Trains a LightGBM model using the provided merged dataframe.
@@ -128,13 +127,13 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu
 
     :return: The non-embedded and embedded datasets
     """
-    creds = get_creds_drive()
-    df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
-    df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
+    # creds = get_creds_drive()
+    # df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+    # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
-    # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data
+    # Having data locally works much faster than retrieving from Drive, so the local reads below are active by default; restore the lines above to download from Drive instead.
-    # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
-    # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
+    df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
+    df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
 
     print("Data loaded")
     return df_non_embedded, df_embedded
@@ -166,12 +165,13 @@
     # Add target column: 1 if the repo is starred or owned by the user, else 0
     owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]]
     starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]]
+    print("Owned length: ", len(owned_by_target_repo_ids))
+    print("Starred length: ", len(starred_repo_ids))
     starred_or_owned_by_user: List = starred_repo_ids + owned_by_target_repo_ids
     df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0)
 
     return df_merged, starred_or_owned_by_user
 
-
 def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame, df_embedded: DataFrame,
                                       number_of_recommendations: int = 10) -> list:
     """
@@ -217,5 +217,7 @@
         else:
             counter += 1
             recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index]))
-
+    cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+    cached_recommendations[target_user] = recommendations
+    save_cache(cached_recommendations, 'codecompasslib/recommendations_cache.pkl')
     return recommendations
diff --git a/codecompasslib/recommendations_cache.pkl b/codecompasslib/recommendations_cache.pkl
new file mode 100644
index 0000000..ef41e7b
Binary files /dev/null and b/codecompasslib/recommendations_cache.pkl differ
diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py
index 68b8cfc..754994f 100644
--- a/frontend/recommender/app.py
+++ b/frontend/recommender/app.py
@@ -13,6 +13,7 @@
 # Import necessary functions from codecompasslib
 from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data
+from codecompasslib.API.helper_functions import load_cache
 
 # Function to load cached data
 def load_cached_data():
@@ -24,6 +25,7 @@
         full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'
         st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id)
     return st.session_state.cached_data
+
 
 def main():
     # Load the data
@@ -41,9 +43,13 @@
         if target_user not in df_embedded['owner_user'].values:
             st.error("User not found in the dataset. Please enter a valid username.")
         else:
-            # Generate recommendations
-            with st.spinner('Generating recommendations...'):
-                recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
+            cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+            if target_user in cached_recommendations:
+                recommendations = cached_recommendations[target_user]
+            else:
+                # Generate recommendations
+                with st.spinner('Generating recommendations...'):
+                    recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
 
             # Display recommendations
             st.subheader("Recommendations")
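With this change the read half of the caching lives in app.py while the write half lives inside generate_lightGBM_recommendations. One way to centralize the lookup-or-compute pattern — a sketch only; `get_recommendations_cached` is a hypothetical helper, and if adopted, the cache write inside generate_lightGBM_recommendations could be dropped to avoid double writes:

```python
from codecompasslib.API.helper_functions import load_cache, save_cache
from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations

CACHE_PATH = 'codecompasslib/recommendations_cache.pkl'  # same file the diff introduces

def get_recommendations_cached(target_user, df_non_embedded, df_embedded,
                               number_of_recommendations=10):
    """Return cached recommendations for target_user, computing and caching on a miss."""
    cache = load_cache(CACHE_PATH)
    if target_user not in cache:
        cache[target_user] = generate_lightGBM_recommendations(
            target_user, df_non_embedded, df_embedded,
            number_of_recommendations=number_of_recommendations)
        save_cache(cache, CACHE_PATH)
    return cache[target_user]
```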