From 8ca5f04967d72760268f44581ee92f3fd0b866c4 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:21:03 +0200 Subject: [PATCH 1/6] adding cosine and KNN as ipynb --- .../models/examples/cosine_similarity.ipynb | 254 ++++++++++++++++++ .../models/examples/knn_model.ipynb | 230 ++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 codecompasslib/models/examples/cosine_similarity.ipynb create mode 100644 codecompasslib/models/examples/knn_model.ipynb diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb new file mode 100644 index 0000000..9da1581 --- /dev/null +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'codecompasslib'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "import pandas as pd\n", + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "\n", + "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"\n", + " Load the data from the Google Drive\n", + " :return: The non-embedded and embedded datasets\n", + " \"\"\"\n", + " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", + " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", + "\n", + " creds = get_creds_drive()\n", + " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", + " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", + "\n", + " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", + " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", + " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "\n", + " print(\"Data loaded\")\n", + " return df_non_embedded, df_embedded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", + "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", + "\n", + "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_clean_data(df_non_embedded):\n", + " \"\"\"\n", + " Load and clean the dataset from a specified filepath.\n", + " \n", + " Args:\n", + " filepath (str): The file path to the dataset.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: The cleaned DataFrame.\n", + " \"\"\"\n", + " # Load the data\n", + " df = df_non_embedded\n", + "\n", + " # Delete missing values\n", + " df.dropna(inplace=True)\n", + "\n", + " # Delete columns that are not needed\n", + " columns_to_drop = [\n", + " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", + " 'owner_type', 'has_pages', 'has_wiki', \n", + " 'has_issues', 'has_downloads', 'is_fork'\n", + " ]\n", + " df.drop(columns=columns_to_drop, inplace=True)\n", + "\n", + " # Handling missing values in text columns\n", + " df['description'].fillna('', inplace=True)\n", + " df['name'].fillna('', inplace=True)\n", + " df['language'].fillna('', inplace=True)\n", + "\n", + " # Drop duplicates with name\n", + " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_cosine_similarity_scores(df):\n", + " \"\"\"\n", + " Calculate cosine similarity scores for the dataset.\n", + "\n", + " Args:\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + "\n", + " Returns:\n", + " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", + " \"\"\"\n", + " # Concatenating the text columns for vectorization\n", + " text_data = df['name'] + \" \" + df['description'] + \" \" + df['language']\n", + "\n", + " # Vectorizing the text data using TF-IDF\n", + " tfidf_vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", + "\n", + " # Calculating cosine similarity\n", + " cosine_sim = cosine_similarity(tfidf_matrix)\n", + "\n", + " # Average the cosine similarities for each repo\n", + " similarity_scores = np.mean(cosine_sim, axis=1)\n", + "\n", + " # Adding the new column to the dataset\n", + " df['cosine_similarity_score'] = similarity_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_cosine_similarity_scores(df):\n", + " \"\"\"\n", + " Calculate cosine similarity scores for the dataset.\n", + "\n", + " Args:\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + "\n", + " Returns:\n", + " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", + " \"\"\"\n", + " # Concatenating the text columns for vectorization\n", + " text_data = df['name'] + \" \" + df['description'] + \" \" + 
df['language']\n", + "\n", + " # Vectorizing the text data using TF-IDF\n", + " tfidf_vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", + "\n", + " # Calculating cosine similarity\n", + " cosine_sim = cosine_similarity(tfidf_matrix)\n", + "\n", + " # Average the cosine similarities for each repo\n", + " similarity_scores = np.mean(cosine_sim, axis=1)\n", + "\n", + " # Adding the new column to the dataset\n", + " df['cosine_similarity_score'] = similarity_scores\n", + "\n", + " return df, tfidf_vectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10):\n", + " \"\"\"\n", + " Recommend repositories based on user preferences.\n", + "\n", + " Args:\n", + " user_preference (str): The user's preferred keywords or phrases.\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + " tfidf_vectorizer (TfidfVectorizer): The TF-IDF vectorizer used for transforming text data.\n", + " top_n (int, optional): Number of top recommendations to return. Defaults to 10.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: DataFrame containing top_n recommended repositories.\n", + " \"\"\"\n", + " # Vectorize the user preference\n", + " user_pref_vector = tfidf_vectorizer.transform([user_preference])\n", + "\n", + " # Calculate cosine similarity with all repositories\n", + " cosine_scores = cosine_similarity(user_pref_vector, tfidf_vectorizer.transform(df['name'] + \" \" + df['description'] + \" \" + df['language'])).flatten()\n", + "\n", + " # Get the indices of the repositories with the highest similarity scores\n", + " top_indices = np.argsort(cosine_scores)[-top_n:][::-1]\n", + "\n", + " # Select the top n recommended repositories\n", + " recommended_repos = df.iloc[top_indices].reset_index(drop=True)\n", + "\n", + " return recommended_repos[['name', 'description', 'language', 'cosine_similarity_score']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def main(df):\n", + " \"\"\"\n", + " Main function to run the script.\n", + " \"\"\"\n", + " df, tfidf_vectorizer = calculate_cosine_similarity_scores(df)\n", + " user_preference = \"python\"\n", + " recommended_repos = recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10)\n", + " print(recommended_repos)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb new file mode 100644 index 0000000..5d7c688 --- /dev/null +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'codecompasslib'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell 
\u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mneighbors\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NearestNeighbors\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n\u001b[1;32m 13\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_word2vec_model\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "import pandas as pd\n", + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from sklearn.neighbors import NearestNeighbors\n", + "\n", + "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n", + "sys.path.append('../../')\n", + "from codecompasslib.models.embeddings import load_word2vec_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"\n", + " Load the data from the Google Drive\n", + " :return: The non-embedded and embedded datasets\n", + " \"\"\"\n", + " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", + " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", + "\n", + " creds = get_creds_drive()\n", + " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", + " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", + "\n", + " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", + " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", + " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "\n", + " print(\"Data loaded\")\n", + " return df_non_embedded, df_embedded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", + "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", + "\n", + "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_clean_data(df_non_embedded):\n", + " \"\"\"\n", + " Load and clean the dataset from a specified filepath.\n", + " \n", + " Args:\n", + " filepath (str): The file path to the dataset.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: The cleaned DataFrame.\n", + " \"\"\"\n", + " # Load the data\n", + " df = df_non_embedded\n", + "\n", + " # Delete missing values\n", + " df.dropna(inplace=True)\n", + "\n", + " # Delete columns that are not needed\n", + " columns_to_drop = [\n", + " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", + " 'owner_type', 'has_pages', 'has_wiki', \n", + " 'has_issues', 'has_downloads', 'is_fork'\n", + " ]\n", + " df.drop(columns=columns_to_drop, inplace=True)\n", + "\n", + " # Handling missing values in text columns\n", + " df['description'].fillna('', inplace=True)\n", + " df['name'].fillna('', inplace=True)\n", + " df['language'].fillna('', inplace=True)\n", + "\n", + " # Drop duplicates with name\n", + " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_and_clean_data(df_non_embedded)\n", + "\n", + "# count unique languges\n", + "df['language'].nunique()\n", + "\n", + "# Create list of unique languages with _ prefix\n", + "languages = ['_' + language for language in df['language'].unique()]\n", + "\n", + "# one hot encode the languages and don't include the language prefix\n", + "df = pd.get_dummies(df, columns=['language'], prefix='')\n", + "\n", + "# Turn df into a repo specific df with owner_user as a unique identifier, appending description and keeping 1 if any of the languages are present in at least one repo\n", + "\n", + "# Create a dictionary for aggregation\n", + "aggregation_dict = {\n", + " 'name': lambda x: list(x),\n", + " 'description': lambda x: list(x)\n", + "}\n", + "\n", + "# Add columns for languages\n", + "for lang in languages:\n", + " aggregation_dict[lang] = 'max'\n", + "\n", + "# Group by 'owner_user' and aggregate\n", + "user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()\n", + "\n", + "# Display the first few rows of the resulting DataFrame\n", + "user_df.head()\n", + "\n", + "# first we turn list of names and descriptions into a single string\n", + "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", + "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", + "user_df.head()\n", + "word_vect = load_word2vec_model\n", + "\n", + "# Text preprocessing\n", + "embedded_user_df = user_df.copy()\n", + "embedded_user_df['name'] = 
user_df['name'].fillna('') \n", + "embedded_user_df['description'] = user_df['description'].fillna('')\n", + "\n", + "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(vectorize_text)\n", + "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(vectorize_text)\n", + "embedded_user_df\n", + "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform df into something that KNN can use. To be more specific, into a feature matrix\n", + "# Create a list of all the vectors\n", + "vectors = []\n", + "repo_df = embedded_user_df * 1 # convert all boolean values in repo_df to 0 or 1\n", + "\n", + "for row in repo_df.index: \n", + " vector = []\n", + " for columns in ['name_vector', 'description_vector']:\n", + " if type(repo_df.at[row, columns]) == np.ndarray:\n", + " for element in repo_df.at[row, columns]:\n", + " vector.append(element)\n", + " else: vector.append(repo_df.at[row, columns])\n", + " vectors.append(vector)\n", + "\n", + " # Train Nearest Neighbors Model\n", + "k = 5 # Number of neighbors to find\n", + "nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')\n", + "nn_model.fit(vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example Usage\n", + "\n", + "target_user = 21\n", + "# neighbors excluding the target user\n", + "neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]\n", + "neighbors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 86a3bba780231146b651a2a8b60a874d42972952 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:25:04 +0200 Subject: [PATCH 2/6] clean up --- .../models/cosine_similarity_model.py | 2 +- .../models/examples/cosine_similarity.ipynb | 18 ++------ .../models/examples/knn_model.ipynb | 42 ++++++++++--------- 3 files changed, 27 insertions(+), 35 deletions(-) diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py index f81187a..0253ab7 100644 --- a/codecompasslib/models/cosine_similarity_model.py +++ b/codecompasslib/models/cosine_similarity_model.py @@ -35,7 +35,7 @@ def load_and_clean_data(filepath): pandas.DataFrame: The cleaned DataFrame. 
""" # Load the data - df = pd.read_csv(filepath) + df = pd.read_csv(filepath) # Delete missing values df.dropna(inplace=True) diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb index 9da1581..55f121a 100644 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -2,21 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'codecompasslib'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -28,7 +16,7 @@ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", - "\n", + "sys.path.append('../../../')\n", "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" ] }, diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index 5d7c688..196f470 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -2,21 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'codecompasslib'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mneighbors\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NearestNeighbors\n\u001b[0;32m---> 
12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n\u001b[1;32m 13\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_word2vec_model\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -29,14 +17,14 @@ "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.neighbors import NearestNeighbors\n", "\n", + "sys.path.append('../../../')\n", "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n", - "sys.path.append('../../')\n", - "from codecompasslib.models.embeddings import load_word2vec_model" + "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -62,9 +50,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m full_data_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m full_data_embedded_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m139wi78iRzhwGZwxmI5WALoYocR-Rk9By\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m df_non_embedded, df_embedded \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_data_folder_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfull_data_embedded_folder_id\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[5], line 9\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(full_data_folder_id, full_data_embedded_folder_id)\u001b[0m\n\u001b[1;32m 6\u001b[0m DRIVE_ID \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0AL1DtB4TdEWdUk9PVA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 7\u001b[0m DATA_FOLDER \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 9\u001b[0m creds \u001b[38;5;241m=\u001b[39m \u001b[43mget_creds_drive\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m df_non_embedded: DataFrame \u001b[38;5;241m=\u001b[39m 
download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_folder_id)\n\u001b[1;32m 11\u001b[0m df_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_embedded_folder_id)\n", + "File \u001b[0;32m~/VSCode/CodeCompass/codecompasslib/models/examples/../../../codecompasslib/API/drive_operations.py:24\u001b[0m, in \u001b[0;36mget_creds_drive\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_creds_drive\u001b[39m() \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Credentials:\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;124;03m Get the credentials for the Google Drive API\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m :return: None\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m creds: Credentials \u001b[38;5;241m=\u001b[39m \u001b[43mCredentials\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_authorized_user_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mOUTER_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/secrets/token.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mSCOPES\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m creds \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mexpired \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mrefresh_token:\n", + "File \u001b[0;32m~/VSCode/CodeCompass/.venv/lib/python3.11/site-packages/google/oauth2/credentials.py:537\u001b[0m, in \u001b[0;36mCredentials.from_authorized_user_file\u001b[0;34m(cls, filename, scopes)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_authorized_user_file\u001b[39m(\u001b[38;5;28mcls\u001b[39m, filename, scopes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 523\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Creates a Credentials instance from an authorized user json file.\u001b[39;00m\n\u001b[1;32m 524\u001b[0m \n\u001b[1;32m 525\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;124;03m ValueError: If the file is not in the expected format.\u001b[39;00m\n\u001b[1;32m 536\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 537\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m 
\u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[1;32m 538\u001b[0m data \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_authorized_user_info(data, scopes)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'" + ] + } + ], "source": [ "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", @@ -152,6 +155,7 @@ "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", "user_df.head()\n", + "\n", "word_vect = load_word2vec_model\n", "\n", "# Text preprocessing\n", From 85b13ceffe7444ed2867aa2e537ffe1c8c548d5e Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:49:00 +0200 Subject: [PATCH 3/6] small description additions --- .../models/examples/cosine_similarity.ipynb | 13 ++++++++ .../models/examples/knn_model.ipynb | 30 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb index 55f121a..c9f3638 100644 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -20,6 +20,19 @@ "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cosine Similairty Model\n", + "\n", + "This model utilizes the cosine similarity between the query and the documents to rank the documents. The cosine similarity is calculated as follows:\n", + "\n", + "- Using NLP and TFIDF, the repository, language and its description are tokenized and vectorized.\n", + "- The cosine similarity is calculated.\n", + "- The repos are ranked based on the cosine similarity." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index 196f470..d3d31e6 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -22,6 +22,36 @@ "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word2Vec\n", + "Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning based on the context in which they appear.\n", + "\n", + "Here’s how it works:\n", + "\n", + "Initialize a vector for each word randomly.\n", + "For each word in the corpus:\n", + "Predict the context words (words nearby) given the target word (skip-gram).\n", + "Adjust the word vectors to minimize the prediction error.\n", + "The learned vectors represent the words’ meanings. These vectors can be used for tasks like document similarity, text classification, and information retrieval.\n", + "\n", + "In our project we use pre-trained word2vec model, specifically trained on software engineering domain." 
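+ ,"\n",
+ "\n",
+ "A minimal sketch of this embedding step (an illustration of the idea, not the exact `vectorize_text` helper imported above): embed a text snippet by averaging the Word2Vec vectors of its in-vocabulary tokens.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "from gensim.models.keyedvectors import KeyedVectors\n",
+ "\n",
+ "# Pre-trained Stack Overflow vectors, at the path used by this repo's helper functions\n",
+ "word_vect = KeyedVectors.load_word2vec_format(\n",
+ "    'codecompasslib/PretrainedModels/SO_vectors_200.bin', binary=True)\n",
+ "\n",
+ "def embed_text(text, model=word_vect):\n",
+ "    # Average the vectors of tokens the model knows; zero vector if none match\n",
+ "    tokens = [t for t in str(text).lower().split() if t in model]\n",
+ "    if not tokens:\n",
+ "        return np.zeros(model.vector_size)\n",
+ "    return np.mean([model[t] for t in tokens], axis=0)\n",
+ "```\n",
+ "\n",
+ "Averaging is a deliberately simple pooling choice: it discards word order but keeps the resulting vectors comparable across texts of different lengths."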
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### K-Nearest Neighbors (KNN)\n", + "KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.\n", + "\n", + "Distance Metric: To measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, or cosine similarity.\n", + "Prediction: Given a new data point, find its K nearest neighbors based on the chosen distance metric.\n", + "In our project we use KNN to find users most similar to our target user." + ] + }, { "cell_type": "code", "execution_count": 5, From dd30b8b84bf0f5333c68e507f5191ebe65d4c598 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:50:12 +0200 Subject: [PATCH 4/6] small deletion --- .../models/examples/cosine_similarity.ipynb | 255 ------------------ 1 file changed, 255 deletions(-) delete mode 100644 codecompasslib/models/examples/cosine_similarity.ipynb diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb deleted file mode 100644 index c9f3638..0000000 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ /dev/null @@ -1,255 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "import pandas as pd\n", - "from typing import Tuple\n", - "from pandas import DataFrame\n", - "import numpy as np\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "\n", - "sys.path.append('../../../')\n", - "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cosine Similairty Model\n", - "\n", - "This model utilizes the cosine similarity between the query and the documents to rank the documents. The cosine similarity is calculated as follows:\n", - "\n", - "- Using NLP and TFIDF, the repository, language and its description are tokenized and vectorized.\n", - "- The cosine similarity is calculated.\n", - "- The repos are ranked based on the cosine similarity." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", - " \"\"\"\n", - " Load the data from the Google Drive\n", - " :return: The non-embedded and embedded datasets\n", - " \"\"\"\n", - " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", - " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", - "\n", - " creds = get_creds_drive()\n", - " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", - " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", - "\n", - " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", - " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", - " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", - "\n", - " print(\"Data loaded\")\n", - " return df_non_embedded, df_embedded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", - "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", - "\n", - "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load_and_clean_data(df_non_embedded):\n", - " \"\"\"\n", - " Load and clean the dataset from a specified filepath.\n", - " \n", - " Args:\n", - " filepath (str): The file path to the dataset.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: The cleaned DataFrame.\n", - " \"\"\"\n", - " # Load the data\n", - " df = df_non_embedded\n", - "\n", - " # Delete missing values\n", - " df.dropna(inplace=True)\n", - "\n", - " # Delete columns that are not needed\n", - " columns_to_drop = [\n", - " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", - " 'owner_type', 'has_pages', 'has_wiki', \n", - " 'has_issues', 'has_downloads', 'is_fork'\n", - " ]\n", - " df.drop(columns=columns_to_drop, inplace=True)\n", - "\n", - " # Handling missing values in text columns\n", - " df['description'].fillna('', inplace=True)\n", - " df['name'].fillna('', inplace=True)\n", - " df['language'].fillna('', inplace=True)\n", - "\n", - " # Drop duplicates with name\n", - " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", - "\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_cosine_similarity_scores(df):\n", - " \"\"\"\n", - " Calculate cosine similarity scores for the dataset.\n", - "\n", - " Args:\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - "\n", - " Returns:\n", - " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", - " \"\"\"\n", - " # Concatenating the text columns for vectorization\n", - " text_data = df['name'] + \" \" + df['description'] + \" \" + df['language']\n", - "\n", - " # Vectorizing the text data using TF-IDF\n", - " tfidf_vectorizer = TfidfVectorizer()\n", - " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", - "\n", - " # Calculating cosine similarity\n", - " cosine_sim = cosine_similarity(tfidf_matrix)\n", - "\n", - " # Average the cosine similarities for each repo\n", - " similarity_scores = np.mean(cosine_sim, axis=1)\n", - "\n", - " # Adding the new column to the dataset\n", - " df['cosine_similarity_score'] = similarity_scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_cosine_similarity_scores(df):\n", - " \"\"\"\n", - " Calculate cosine similarity scores for the dataset.\n", - "\n", - " Args:\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - "\n", - " Returns:\n", - " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", - " \"\"\"\n", - " # Concatenating the text columns for vectorization\n", - " text_data = df['name'] + \" \" + df['description'] + \" \" + 
df['language']\n", - "\n", - " # Vectorizing the text data using TF-IDF\n", - " tfidf_vectorizer = TfidfVectorizer()\n", - " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", - "\n", - " # Calculating cosine similarity\n", - " cosine_sim = cosine_similarity(tfidf_matrix)\n", - "\n", - " # Average the cosine similarities for each repo\n", - " similarity_scores = np.mean(cosine_sim, axis=1)\n", - "\n", - " # Adding the new column to the dataset\n", - " df['cosine_similarity_score'] = similarity_scores\n", - "\n", - " return df, tfidf_vectorizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10):\n", - " \"\"\"\n", - " Recommend repositories based on user preferences.\n", - "\n", - " Args:\n", - " user_preference (str): The user's preferred keywords or phrases.\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - " tfidf_vectorizer (TfidfVectorizer): The TF-IDF vectorizer used for transforming text data.\n", - " top_n (int, optional): Number of top recommendations to return. Defaults to 10.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: DataFrame containing top_n recommended repositories.\n", - " \"\"\"\n", - " # Vectorize the user preference\n", - " user_pref_vector = tfidf_vectorizer.transform([user_preference])\n", - "\n", - " # Calculate cosine similarity with all repositories\n", - " cosine_scores = cosine_similarity(user_pref_vector, tfidf_vectorizer.transform(df['name'] + \" \" + df['description'] + \" \" + df['language'])).flatten()\n", - "\n", - " # Get the indices of the repositories with the highest similarity scores\n", - " top_indices = np.argsort(cosine_scores)[-top_n:][::-1]\n", - "\n", - " # Select the top n recommended repositories\n", - " recommended_repos = df.iloc[top_indices].reset_index(drop=True)\n", - "\n", - " return recommended_repos[['name', 'description', 'language', 'cosine_similarity_score']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def main(df):\n", - " \"\"\"\n", - " Main function to run the script.\n", - " \"\"\"\n", - " df, tfidf_vectorizer = calculate_cosine_similarity_scores(df)\n", - " user_preference = \"python\"\n", - " recommended_repos = recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10)\n", - " print(recommended_repos)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From c1496c0556bd7a07fb3fa35c24b549013f3d595f Mon Sep 17 00:00:00 2001 From: KTsula Date: Tue, 13 Aug 2024 01:06:34 +0400 Subject: [PATCH 5/6] Added new users' repos and tested --- .gitignore | 4 +- .../embeddings/embeddings_helper_functions.py | 13 +- .../embeddings/generate_embedded_dataset.py | 4 +- .../models/examples/knn_model.ipynb | 312 +++++++++++++----- codecompasslib/models/lightgbm_model.py | 10 +- 5 files changed, 253 insertions(+), 90 deletions(-) diff --git a/.gitignore b/.gitignore index 58391e9..29fb4f2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,9 @@ secrets/ **/__pycache__/ codecompasslib/API/datasets/**.csv codecompasslib/API/**.txt 
+codecompasslib/embeddings/**.csv dataset_new.csv codecompasslib/models/**.csv codecompasslib/models/examples/**.csv -codecompasslib/PretrainedModels/ \ No newline at end of file +codecompasslib/PretrainedModels/ +**.csv \ No newline at end of file diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py index 8a13a2f..3471f3f 100644 --- a/codecompasslib/embeddings/embeddings_helper_functions.py +++ b/codecompasslib/embeddings/embeddings_helper_functions.py @@ -1,3 +1,14 @@ +import sys +import os + +# Construct the path to the root directory (one level up from embeddings) +root_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(root_dir) +real_project_dir = os.path.dirname(project_dir) + +# Add the project directory to the Python path +sys.path.insert(0, real_project_dir) + import numpy as np import pandas as pd from gensim.models.keyedvectors import KeyedVectors @@ -35,7 +46,7 @@ def load_word2vec_model(): Citation: Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo. https://doi.org/10.5281/zenodo.1199620 """ - word_vect = KeyedVectors.load_word2vec_format("./codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True) + word_vect = KeyedVectors.load_word2vec_format("codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True) return word_vect # Vectorizing text using domain specific word2vec model diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py index aaa05a7..ce8bd9a 100644 --- a/codecompasslib/embeddings/generate_embedded_dataset.py +++ b/codecompasslib/embeddings/generate_embedded_dataset.py @@ -90,13 +90,13 @@ def generate_openAI_embedded_csv(df, column_to_embed): # Save the current batch DataFrame to a CSV file # Mode 'a' is for append, header=False to avoid writing headers multiple times - batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False) + batch_df.to_csv('df_embedded_1208_batch.csv', mode='a', header=not i, index=False) # Optional: Free up memory by deleting the batch DataFrame if no longer needed del batch_df # Load the CSV file with the embeddings - df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv') + df_with_embeddings = pd.read_csv('df_embedded_1208_batch.csv') return df_with_embeddings def main(): diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index d3d31e6..9619501 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -54,109 +54,257 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Download 11%.\n", + "\n", + "Download 23%.\n", + "\n", + "Download 35%.\n", + "\n", + "Download 47%.\n", + "\n", + "Download 59%.\n", + "\n", + "Download 71%.\n", + "\n", + "Download 83%.\n", + "\n", + "Download 95%.\n", + "\n", + "Download 100%.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"c:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\API\\drive_operations.py:88: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " return read_csv(fh)\n" + ] + } + ], "source": [ - "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", - " \"\"\"\n", - " Load the data from the Google Drive\n", - " :return: The non-embedded and embedded datasets\n", - " \"\"\"\n", - " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", - " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", - "\n", - " creds = get_creds_drive()\n", - " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", - " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", - "\n", - " # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data\n", - " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", - " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", "\n", - " print(\"Data loaded\")\n", - " return df_non_embedded, df_embedded" + "df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "usecols=['owner_user', 'name', 'description', 'language']\n", + "# drop every column except for these\n", + "df = df_non_embedded.copy()\n", + "df = df[usecols]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, "metadata": {}, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m full_data_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m full_data_embedded_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m139wi78iRzhwGZwxmI5WALoYocR-Rk9By\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m df_non_embedded, df_embedded \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_data_folder_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfull_data_embedded_folder_id\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[5], line 9\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(full_data_folder_id, full_data_embedded_folder_id)\u001b[0m\n\u001b[1;32m 6\u001b[0m DRIVE_ID \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0AL1DtB4TdEWdUk9PVA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 7\u001b[0m DATA_FOLDER \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 9\u001b[0m creds 
\u001b[38;5;241m=\u001b[39m \u001b[43mget_creds_drive\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m df_non_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_folder_id)\n\u001b[1;32m 11\u001b[0m df_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_embedded_folder_id)\n", - "File \u001b[0;32m~/VSCode/CodeCompass/codecompasslib/models/examples/../../../codecompasslib/API/drive_operations.py:24\u001b[0m, in \u001b[0;36mget_creds_drive\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_creds_drive\u001b[39m() \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Credentials:\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;124;03m Get the credentials for the Google Drive API\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m :return: None\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m creds: Credentials \u001b[38;5;241m=\u001b[39m \u001b[43mCredentials\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_authorized_user_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mOUTER_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/secrets/token.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mSCOPES\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m creds \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mexpired \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mrefresh_token:\n", - "File \u001b[0;32m~/VSCode/CodeCompass/.venv/lib/python3.11/site-packages/google/oauth2/credentials.py:537\u001b[0m, in \u001b[0;36mCredentials.from_authorized_user_file\u001b[0;34m(cls, filename, scopes)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_authorized_user_file\u001b[39m(\u001b[38;5;28mcls\u001b[39m, filename, scopes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 523\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Creates a Credentials instance from an authorized user json file.\u001b[39;00m\n\u001b[1;32m 524\u001b[0m \n\u001b[1;32m 525\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;124;03m ValueError: If the file is not in the expected format.\u001b[39;00m\n\u001b[1;32m 536\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 537\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[1;32m 538\u001b[0m data \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_authorized_user_info(data, scopes)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
owner_usernamedescriptionlanguage
4Rameshwar0852Automation_ProjectAutomated Bash Script to automate log Backup g...Shell
18Rameshwar0852IKONCBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION...Python
19Rameshwar0852javamavensonarrgohelmk8No descriptionHTML
23Rameshwar0852node001files_repoJavaScript
24Rameshwar0852nodeandjsnode java script applicationJavaScript
...............
2583820pinaxpinax-bloga blog app for DjangoPython
2583821montyloungedjango-mingusa Django blog engine leveraging reusable apps ...JavaScript
2583822WuXianglongGeekBlogA full blog system based on DjangoJavaScript
2583823NARKOZhacker-scriptsBased on a true storyJavaScript
2583824matthewbdalydjango_tutorial_blog_ngThe source for the new version of my Django tu...Python
\n", + "

1524223 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " owner_user name \\\n", + "4 Rameshwar0852 Automation_Project \n", + "18 Rameshwar0852 IKON \n", + "19 Rameshwar0852 javamavensonarrgohelmk8 \n", + "23 Rameshwar0852 node001 \n", + "24 Rameshwar0852 nodeandjs \n", + "... ... ... \n", + "2583820 pinax pinax-blog \n", + "2583821 montylounge django-mingus \n", + "2583822 WuXianglong GeekBlog \n", + "2583823 NARKOZ hacker-scripts \n", + "2583824 matthewbdaly django_tutorial_blog_ng \n", + "\n", + " description language \n", + "4 Automated Bash Script to automate log Backup g... Shell \n", + "18 CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... Python \n", + "19 No description HTML \n", + "23 files_repo JavaScript \n", + "24 node java script application JavaScript \n", + "... ... ... \n", + "2583820 a blog app for Django Python \n", + "2583821 a Django blog engine leveraging reusable apps ... JavaScript \n", + "2583822 A full blog system based on Django JavaScript \n", + "2583823 Based on a true story JavaScript \n", + "2583824 The source for the new version of my Django tu... Python \n", + "\n", + "[1524223 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", - "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", - "\n", - "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + "df.dropna()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ - "def load_and_clean_data(df_non_embedded):\n", - " \"\"\"\n", - " Load and clean the dataset from a specified filepath.\n", - " \n", - " Args:\n", - " filepath (str): The file path to the dataset.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: The cleaned DataFrame.\n", - " \"\"\"\n", - " # Load the data\n", - " df = df_non_embedded\n", - "\n", - " # Delete missing values\n", - " df.dropna(inplace=True)\n", - "\n", - " # Delete columns that are not needed\n", - " columns_to_drop = [\n", - " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", - " 'owner_type', 'has_pages', 'has_wiki', \n", - " 'has_issues', 'has_downloads', 'is_fork'\n", - " ]\n", - " df.drop(columns=columns_to_drop, inplace=True)\n", - "\n", - " # Handling missing values in text columns\n", - " df['description'].fillna('', inplace=True)\n", - " df['name'].fillna('', inplace=True)\n", - " df['language'].fillna('', inplace=True)\n", - "\n", - " # Drop duplicates with name\n", - " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", - "\n", - " return df" + "# convert language column to string type\n", + "df['language'] = df['language'].astype(str)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File not found.\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'vector_size'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m 
user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43membedded_user_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\series.py:4908\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4780\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4781\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4782\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4788\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4791\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4906\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4907\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4908\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4909\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4910\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4911\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4912\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4913\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4914\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4915\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", + "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m 
embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\models\\model_diff_repos.py:60\u001b[0m, in \u001b[0;36mvectorize_text\u001b[1;34m(text, word_vect)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvectorize_text\u001b[39m(text, word_vect):\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m vector_sum \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(\u001b[43mword_vect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector_size\u001b[49m) \u001b[38;5;66;03m# Initialize an array to store the sum of word vectors\u001b[39;00m\n\u001b[0;32m 61\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;66;03m# Initialize a count to keep track of the number of words found in the vocabulary\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n", + "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'vector_size'" + ] + } + ], "source": [ - "df = load_and_clean_data(df_non_embedded)\n", - "\n", - "# count unique languges\n", - "df['language'].nunique()\n", - "\n", "# Create list of unique languages with _ prefix\n", "languages = ['_' + language for language in df['language'].unique()]\n", "\n", @@ -193,8 +341,10 @@ "embedded_user_df['name'] = user_df['name'].fillna('') \n", "embedded_user_df['description'] = user_df['description'].fillna('')\n", "\n", - "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(vectorize_text)\n", - "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(vectorize_text)\n", + "\n", + "word2vec_model = load_word2vec_model()\n", + "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))\n", + "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))\n", "embedded_user_df\n", "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)" ] @@ -256,7 +406,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py index 9d21174..7a1ba0e 100644 --- a/codecompasslib/models/lightgbm_model.py +++ b/codecompasslib/models/lightgbm_model.py @@ -128,13 +128,13 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu :return: The non-embedded and embedded datasets """ - creds = get_creds_drive() - df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id) - df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) + # creds = get_creds_drive() + # df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id) + # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data - # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv') - # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv') + df_non_embedded = pd.read_csv('codecompasslib/models/data_full_new.csv') + df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined_new.csv') print("Data loaded") return df_non_embedded, df_embedded From ebc0d7bb4b16f3ba06178fdb8e295b645e4c0ff9 Mon Sep 17 00:00:00 2001 From: KTsula Date: Thu, 15 Aug 2024 02:41:30 +0400 Subject: [PATCH 6/6] Add caching mechanism with pickle --- codecompasslib/API/helper_functions.py | 29 +++++++++++++++++++++++ codecompasslib/models/lightgbm_model.py | 14 ++++++----- codecompasslib/recommendations_cache.pkl | Bin 0 -> 1180 bytes frontend/recommender/app.py | 12 +++++++--- 4 files changed, 46 insertions(+), 9 deletions(-) create mode 100644 codecompasslib/recommendations_cache.pkl diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py index 4b04da0..dfab9c7 100644 --- a/codecompasslib/API/helper_functions.py +++ b/codecompasslib/API/helper_functions.py @@ -1,3 +1,5 @@ +import pickle +import os from json import load from pandas import DataFrame from os.path import dirname @@ -17,6 +19,33 @@ def save_to_csv(data: any, filename: str) -> None: df: DataFrame = DataFrame(data) df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False) +def save_cache(cache_data: dict, cache_filename: str): + """ + Save a dictionary to a file in pickle format. + + :param cache_data: The dictionary to be saved. + :param cache_filename: The name of the file where the cache will be saved. + """ + with open(cache_filename, 'wb') as cache_file: + pickle.dump(cache_data, cache_file) + print(f"Cache saved to {cache_filename}") + +def load_cache(cache_filename: str) -> dict: + """ + Load a dictionary from a pickle file. + + :param cache_filename: The name of the file where the cache is stored. + :return: The loaded dictionary. + """ + if os.path.exists(cache_filename): + with open(cache_filename, 'rb') as cache_file: + cache_data = pickle.load(cache_file) + print(f"Cache loaded from {cache_filename}") + return cache_data + else: + print(f"No cache found at {cache_filename}") + return {} + def list_to_txt(data: list, file_name: str) -> bool: """ diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py index 7a1ba0e..7d64b7f 100644 --- a/codecompasslib/models/lightgbm_model.py +++ b/codecompasslib/models/lightgbm_model.py @@ -19,7 +19,7 @@ from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos - +from codecompasslib.API.helper_functions import save_cache, load_cache def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]: """ @@ -38,7 +38,6 @@ def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl del df[label_col] return df, y - def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]: """ Trains a LightGBM model using the provided merged dataframe. @@ -133,8 +132,8 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data - df_non_embedded = pd.read_csv('codecompasslib/models/data_full_new.csv') - df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined_new.csv') + df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv') + df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv') print("Data loaded") return df_non_embedded, df_embedded @@ -166,12 +165,13 @@ def preprocess_data(df_embedded: DataFrame, df_non_embedded: DataFrame, # Add target column: 1 if the repo is starred or owned by the user, else 0 owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]] starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]] + print("Owned length: ", len(owned_by_target_repo_ids)) + print("Starred length: ", len(starred_repo_ids)) starred_or_owned_by_user:List = starred_repo_ids + owned_by_target_repo_ids df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0) return df_merged, starred_or_owned_by_user - def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame, df_embedded: DataFrame, number_of_recommendations: int = 10) -> list: """ @@ -217,5 +217,7 @@ def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFra else: counter += 1 recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index])) - + cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl') + cached_recommendations[target_user] = recommendations + save_cache(cached_recommendations, 'codecompasslib/recommendations_cache.pkl') return recommendations diff --git a/codecompasslib/recommendations_cache.pkl b/codecompasslib/recommendations_cache.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef41e7b23debd1252c53b81c729f4ee2439a5634 GIT binary patch literal 1180 zcmaje&ui2`6bJBVyFaRK_0mHr9t*v=2!iz>*iw4g;!^c2@{&w4W0INeWM;QXu`CL< zg-{Sj_74yhgkDtqaa+83kscMnlP6EbYY&=TC+%hFyh8$$_sN%;&7vPg@h#b5LQNBOVfp#C&Y5sy2NN2gaC8Qe^?yjkj#&Tz literal 0 HcmV?d00001 diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py index 68b8cfc..754994f 100644 --- a/frontend/recommender/app.py +++ b/frontend/recommender/app.py @@ -13,6 +13,7 @@ # Import necessary functions from codecompasslib from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data +from codecompasslib.API.helper_functions import load_cache # Function to load cached data def load_cached_data(): @@ -24,6 +25,7 @@ def load_cached_data(): full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By' st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id) return st.session_state.cached_data + def main(): # Load the data @@ -41,9 +43,13 @@ def main(): if target_user not in df_embedded['owner_user'].values: st.error("User not found in the dataset. 
Please enter a valid username.")
     else:
-        # Generate recommendations
-        with st.spinner('Generating recommendations...'):
-            recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
+        cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+        if target_user in cached_recommendations:
+            recommendations = cached_recommendations[target_user]
+        else:
+            # Generate recommendations
+            with st.spinner('Generating recommendations...'):
+                recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
 
         # Display recommendations
         st.subheader("Recommendations")
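
The FileNotFoundError recorded at the top of the notebook comes from Credentials.from_authorized_user_file being handed a secrets/token.json path that does not exist on the machine running the example. A minimal sketch of a friendlier guard follows; the wrapper name, TOKEN_PATH, and SCOPES value are illustrative assumptions, and the real helper is get_creds_drive in codecompasslib/API/drive_operations.py.

    import os
    from google.oauth2.credentials import Credentials

    # Assumed values for illustration only: the token location and the exact
    # Drive scope used by the project are not visible in this patch.
    TOKEN_PATH = os.path.join("secrets", "token.json")
    SCOPES = ["https://www.googleapis.com/auth/drive"]

    def get_creds_drive_safe() -> Credentials:
        # Fail fast with an actionable message instead of a bare FileNotFoundError
        if not os.path.exists(TOKEN_PATH):
            raise FileNotFoundError(
                f"{TOKEN_PATH} is missing: generate an OAuth token for the Drive API, "
                "or read the CSVs locally as the commented lines in load_data suggest."
            )
        return Credentials.from_authorized_user_file(TOKEN_PATH, SCOPES)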
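
One pandas detail worth noting about the df.dropna() cell whose output is shown above: called without arguments it returns a cleaned copy and leaves df untouched, so on its own that cell only previews the result. A minimal illustration (the toy frame is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"name": ["IKON", None], "language": ["Python", None]})
    df.dropna()                # returns a cleaned copy; df itself still holds the NaN row
    df = df.dropna()           # rebind the name to persist the cleaning
    # df.dropna(inplace=True)  # equivalent in-place form used elsewhere in the notebook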
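
The AttributeError above ('NoneType' object has no attribute 'vector_size') is a knock-on effect of the "File not found." message printed just before it: load_word2vec_model() returned None, and vectorize_text then dereferenced the missing model. Below is a sketch of a defensive variant, reconstructed from the frames visible in the traceback; the None guard, the out-of-vocabulary check, and the zero-vector fallback are assumptions rather than the project's actual implementation.

    import numpy as np

    def vectorize_text_safe(text: str, word_vect) -> np.ndarray:
        # Surface the real problem (model never loaded) instead of an opaque AttributeError
        if word_vect is None:
            raise ValueError("word2vec model is not loaded; check the model file path")
        vector_sum = np.zeros(word_vect.vector_size)  # running sum of word vectors
        count = 0
        for word in text.split():
            if word in word_vect:                     # skip out-of-vocabulary tokens
                vector_sum += word_vect[word]
                count += 1
        # Average the found vectors; an all-zero vector when nothing matched
        return vector_sum / count if count else vector_sum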
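
The caching patch reduces to one pattern: look the user up in a pickled dict before running the model, and persist any fresh result for next time. A minimal sketch combining the new helpers is below; compute_recommendations is a hypothetical stand-in for generate_lightGBM_recommendations and its dataframe arguments.

    from codecompasslib.API.helper_functions import load_cache, save_cache

    CACHE_PATH = "codecompasslib/recommendations_cache.pkl"

    def recommendations_for(target_user: str, compute_recommendations) -> list:
        cache = load_cache(CACHE_PATH)      # returns {} when no pickle exists yet
        if target_user in cache:            # cache hit: skip the expensive model run
            return cache[target_user]
        recommendations = compute_recommendations(target_user)
        cache[target_user] = recommendations
        save_cache(cache, CACHE_PATH)       # persist for subsequent sessions
        return recommendations

Note that nothing invalidates this cache: a user's recommendations stay frozen until recommendations_cache.pkl is deleted or overwritten, which is worth keeping in mind now that the pickle itself is checked into the repository.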