From 8ca5f04967d72760268f44581ee92f3fd0b866c4 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:21:03 +0200 Subject: [PATCH 1/6] adding cosine and KNN as ipynb --- .../models/examples/cosine_similarity.ipynb | 254 ++++++++++++++++++ .../models/examples/knn_model.ipynb | 230 ++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 codecompasslib/models/examples/cosine_similarity.ipynb create mode 100644 codecompasslib/models/examples/knn_model.ipynb diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb new file mode 100644 index 0000000..9da1581 --- /dev/null +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'codecompasslib'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "import pandas as pd\n", + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "\n", + "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"\n", + " Load the data from the Google Drive\n", + " :return: The non-embedded and embedded datasets\n", + " \"\"\"\n", + " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", + " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", + "\n", + " creds = get_creds_drive()\n", + " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", + " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", + "\n", + " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", + " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", + " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "\n", + " print(\"Data loaded\")\n", + " return df_non_embedded, df_embedded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", + "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", + "\n", + "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_clean_data(df_non_embedded):\n", + " \"\"\"\n", + " Load and clean the dataset from a specified filepath.\n", + " \n", + " Args:\n", + " filepath (str): The file path to the dataset.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: The cleaned DataFrame.\n", + " \"\"\"\n", + " # Load the data\n", + " df = df_non_embedded\n", + "\n", + " # Delete missing values\n", + " df.dropna(inplace=True)\n", + "\n", + " # Delete columns that are not needed\n", + " columns_to_drop = [\n", + " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", + " 'owner_type', 'has_pages', 'has_wiki', \n", + " 'has_issues', 'has_downloads', 'is_fork'\n", + " ]\n", + " df.drop(columns=columns_to_drop, inplace=True)\n", + "\n", + " # Handling missing values in text columns\n", + " df['description'].fillna('', inplace=True)\n", + " df['name'].fillna('', inplace=True)\n", + " df['language'].fillna('', inplace=True)\n", + "\n", + " # Drop duplicates with name\n", + " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_cosine_similarity_scores(df):\n", + " \"\"\"\n", + " Calculate cosine similarity scores for the dataset.\n", + "\n", + " Args:\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + "\n", + " Returns:\n", + " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", + " \"\"\"\n", + " # Concatenating the text columns for vectorization\n", + " text_data = df['name'] + \" \" + df['description'] + \" \" + df['language']\n", + "\n", + " # Vectorizing the text data using TF-IDF\n", + " tfidf_vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", + "\n", + " # Calculating cosine similarity\n", + " cosine_sim = cosine_similarity(tfidf_matrix)\n", + "\n", + " # Average the cosine similarities for each repo\n", + " similarity_scores = np.mean(cosine_sim, axis=1)\n", + "\n", + " # Adding the new column to the dataset\n", + " df['cosine_similarity_score'] = similarity_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_cosine_similarity_scores(df):\n", + " \"\"\"\n", + " Calculate cosine similarity scores for the dataset.\n", + "\n", + " Args:\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + "\n", + " Returns:\n", + " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", + " \"\"\"\n", + " # Concatenating the text columns for vectorization\n", + " text_data = df['name'] + \" \" + df['description'] + \" \" + 
df['language']\n", + "\n", + " # Vectorizing the text data using TF-IDF\n", + " tfidf_vectorizer = TfidfVectorizer()\n", + " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", + "\n", + " # Calculating cosine similarity\n", + " cosine_sim = cosine_similarity(tfidf_matrix)\n", + "\n", + " # Average the cosine similarities for each repo\n", + " similarity_scores = np.mean(cosine_sim, axis=1)\n", + "\n", + " # Adding the new column to the dataset\n", + " df['cosine_similarity_score'] = similarity_scores\n", + "\n", + " return df, tfidf_vectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10):\n", + " \"\"\"\n", + " Recommend repositories based on user preferences.\n", + "\n", + " Args:\n", + " user_preference (str): The user's preferred keywords or phrases.\n", + " df (pandas.DataFrame): The DataFrame containing repository data.\n", + " tfidf_vectorizer (TfidfVectorizer): The TF-IDF vectorizer used for transforming text data.\n", + " top_n (int, optional): Number of top recommendations to return. Defaults to 10.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: DataFrame containing top_n recommended repositories.\n", + " \"\"\"\n", + " # Vectorize the user preference\n", + " user_pref_vector = tfidf_vectorizer.transform([user_preference])\n", + "\n", + " # Calculate cosine similarity with all repositories\n", + " cosine_scores = cosine_similarity(user_pref_vector, tfidf_vectorizer.transform(df['name'] + \" \" + df['description'] + \" \" + df['language'])).flatten()\n", + "\n", + " # Get the indices of the repositories with the highest similarity scores\n", + " top_indices = np.argsort(cosine_scores)[-top_n:][::-1]\n", + "\n", + " # Select the top n recommended repositories\n", + " recommended_repos = df.iloc[top_indices].reset_index(drop=True)\n", + "\n", + " return recommended_repos[['name', 'description', 'language', 'cosine_similarity_score']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def main(df):\n", + " \"\"\"\n", + " Main function to run the script.\n", + " \"\"\"\n", + " df, tfidf_vectorizer = calculate_cosine_similarity_scores(df)\n", + " user_preference = \"python\"\n", + " recommended_repos = recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10)\n", + " print(recommended_repos)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb new file mode 100644 index 0000000..5d7c688 --- /dev/null +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'codecompasslib'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell 
\u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mneighbors\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NearestNeighbors\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n\u001b[1;32m 13\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_word2vec_model\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "\n", + "import pandas as pd\n", + "from typing import Tuple\n", + "from pandas import DataFrame\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from sklearn.neighbors import NearestNeighbors\n", + "\n", + "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n", + "sys.path.append('../../')\n", + "from codecompasslib.models.embeddings import load_word2vec_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", + " \"\"\"\n", + " Load the data from the Google Drive\n", + " :return: The non-embedded and embedded datasets\n", + " \"\"\"\n", + " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", + " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", + "\n", + " creds = get_creds_drive()\n", + " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", + " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", + "\n", + " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", + " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", + " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "\n", + " print(\"Data loaded\")\n", + " return df_non_embedded, df_embedded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", + "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", + "\n", + "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_clean_data(df_non_embedded):\n", + " \"\"\"\n", + " Load and clean the dataset from a specified filepath.\n", + " \n", + " Args:\n", + " filepath (str): The file path to the dataset.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: The cleaned DataFrame.\n", + " \"\"\"\n", + " # Load the data\n", + " df = df_non_embedded\n", + "\n", + " # Delete missing values\n", + " df.dropna(inplace=True)\n", + "\n", + " # Delete columns that are not needed\n", + " columns_to_drop = [\n", + " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", + " 'owner_type', 'has_pages', 'has_wiki', \n", + " 'has_issues', 'has_downloads', 'is_fork'\n", + " ]\n", + " df.drop(columns=columns_to_drop, inplace=True)\n", + "\n", + " # Handling missing values in text columns\n", + " df['description'].fillna('', inplace=True)\n", + " df['name'].fillna('', inplace=True)\n", + " df['language'].fillna('', inplace=True)\n", + "\n", + " # Drop duplicates with name\n", + " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_and_clean_data(df_non_embedded)\n", + "\n", + "# count unique languges\n", + "df['language'].nunique()\n", + "\n", + "# Create list of unique languages with _ prefix\n", + "languages = ['_' + language for language in df['language'].unique()]\n", + "\n", + "# one hot encode the languages and don't include the language prefix\n", + "df = pd.get_dummies(df, columns=['language'], prefix='')\n", + "\n", + "# Turn df into a repo specific df with owner_user as a unique identifier, appending description and keeping 1 if any of the languages are present in at least one repo\n", + "\n", + "# Create a dictionary for aggregation\n", + "aggregation_dict = {\n", + " 'name': lambda x: list(x),\n", + " 'description': lambda x: list(x)\n", + "}\n", + "\n", + "# Add columns for languages\n", + "for lang in languages:\n", + " aggregation_dict[lang] = 'max'\n", + "\n", + "# Group by 'owner_user' and aggregate\n", + "user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()\n", + "\n", + "# Display the first few rows of the resulting DataFrame\n", + "user_df.head()\n", + "\n", + "# first we turn list of names and descriptions into a single string\n", + "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", + "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", + "user_df.head()\n", + "word_vect = load_word2vec_model\n", + "\n", + "# Text preprocessing\n", + "embedded_user_df = user_df.copy()\n", + "embedded_user_df['name'] = 
user_df['name'].fillna('') \n", + "embedded_user_df['description'] = user_df['description'].fillna('')\n", + "\n", + "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(vectorize_text)\n", + "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(vectorize_text)\n", + "embedded_user_df\n", + "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform df into something that KNN can use. To be more specific, into a feature matrix\n", + "# Create a list of all the vectors\n", + "vectors = []\n", + "repo_df = embedded_user_df * 1 # convert all boolean values in repo_df to 0 or 1\n", + "\n", + "for row in repo_df.index: \n", + " vector = []\n", + " for columns in ['name_vector', 'description_vector']:\n", + " if type(repo_df.at[row, columns]) == np.ndarray:\n", + " for element in repo_df.at[row, columns]:\n", + " vector.append(element)\n", + " else: vector.append(repo_df.at[row, columns])\n", + " vectors.append(vector)\n", + "\n", + " # Train Nearest Neighbors Model\n", + "k = 5 # Number of neighbors to find\n", + "nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')\n", + "nn_model.fit(vectors)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example Usage\n", + "\n", + "target_user = 21\n", + "# neighbors excluding the target user\n", + "neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]\n", + "neighbors" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 86a3bba780231146b651a2a8b60a874d42972952 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:25:04 +0200 Subject: [PATCH 2/6] clean up --- .../models/cosine_similarity_model.py | 2 +- .../models/examples/cosine_similarity.ipynb | 18 ++------ .../models/examples/knn_model.ipynb | 42 ++++++++++--------- 3 files changed, 27 insertions(+), 35 deletions(-) diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py index f81187a..0253ab7 100644 --- a/codecompasslib/models/cosine_similarity_model.py +++ b/codecompasslib/models/cosine_similarity_model.py @@ -35,7 +35,7 @@ def load_and_clean_data(filepath): pandas.DataFrame: The cleaned DataFrame. 
""" # Load the data - df = pd.read_csv(filepath) + df = pd.read_csv(filepath) # Delete missing values df.dropna(inplace=True) diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb index 9da1581..55f121a 100644 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -2,21 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'codecompasslib'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtext\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -28,7 +16,7 @@ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", - "\n", + "sys.path.append('../../../')\n", "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" ] }, diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index 5d7c688..196f470 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -2,21 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'codecompasslib'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpairwise\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cosine_similarity\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mneighbors\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NearestNeighbors\n\u001b[0;32m---> 
12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mAPI\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdrive_operations\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m download_csv_as_pd_dataframe, get_creds_drive\n\u001b[1;32m 13\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcodecompasslib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01membeddings\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_word2vec_model\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'codecompasslib'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -29,14 +17,14 @@ "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.neighbors import NearestNeighbors\n", "\n", + "sys.path.append('../../../')\n", "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n", - "sys.path.append('../../')\n", - "from codecompasslib.models.embeddings import load_word2vec_model" + "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -62,9 +50,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m full_data_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m full_data_embedded_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m139wi78iRzhwGZwxmI5WALoYocR-Rk9By\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m df_non_embedded, df_embedded \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_data_folder_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfull_data_embedded_folder_id\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[5], line 9\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(full_data_folder_id, full_data_embedded_folder_id)\u001b[0m\n\u001b[1;32m 6\u001b[0m DRIVE_ID \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0AL1DtB4TdEWdUk9PVA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 7\u001b[0m DATA_FOLDER \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 9\u001b[0m creds \u001b[38;5;241m=\u001b[39m \u001b[43mget_creds_drive\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m df_non_embedded: DataFrame \u001b[38;5;241m=\u001b[39m 
download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_folder_id)\n\u001b[1;32m 11\u001b[0m df_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_embedded_folder_id)\n", + "File \u001b[0;32m~/VSCode/CodeCompass/codecompasslib/models/examples/../../../codecompasslib/API/drive_operations.py:24\u001b[0m, in \u001b[0;36mget_creds_drive\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_creds_drive\u001b[39m() \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Credentials:\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;124;03m Get the credentials for the Google Drive API\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m :return: None\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m creds: Credentials \u001b[38;5;241m=\u001b[39m \u001b[43mCredentials\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_authorized_user_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mOUTER_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/secrets/token.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mSCOPES\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m creds \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mexpired \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mrefresh_token:\n", + "File \u001b[0;32m~/VSCode/CodeCompass/.venv/lib/python3.11/site-packages/google/oauth2/credentials.py:537\u001b[0m, in \u001b[0;36mCredentials.from_authorized_user_file\u001b[0;34m(cls, filename, scopes)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_authorized_user_file\u001b[39m(\u001b[38;5;28mcls\u001b[39m, filename, scopes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 523\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Creates a Credentials instance from an authorized user json file.\u001b[39;00m\n\u001b[1;32m 524\u001b[0m \n\u001b[1;32m 525\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;124;03m ValueError: If the file is not in the expected format.\u001b[39;00m\n\u001b[1;32m 536\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 537\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m 
\u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[1;32m 538\u001b[0m data \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_authorized_user_info(data, scopes)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'" + ] + } + ], "source": [ "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", @@ -152,6 +155,7 @@ "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n", "user_df.head()\n", + "\n", "word_vect = load_word2vec_model\n", "\n", "# Text preprocessing\n", From 85b13ceffe7444ed2867aa2e537ffe1c8c548d5e Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:49:00 +0200 Subject: [PATCH 3/6] small description additions --- .../models/examples/cosine_similarity.ipynb | 13 ++++++++ .../models/examples/knn_model.ipynb | 30 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb index 55f121a..c9f3638 100644 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ b/codecompasslib/models/examples/cosine_similarity.ipynb @@ -20,6 +20,19 @@ "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cosine Similairty Model\n", + "\n", + "This model utilizes the cosine similarity between the query and the documents to rank the documents. The cosine similarity is calculated as follows:\n", + "\n", + "- Using NLP and TFIDF, the repository, language and its description are tokenized and vectorized.\n", + "- The cosine similarity is calculated.\n", + "- The repos are ranked based on the cosine similarity." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index 196f470..d3d31e6 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -22,6 +22,36 @@ "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word2Vec\n", + "Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning based on the context in which they appear.\n", + "\n", + "Here’s how it works:\n", + "\n", + "Initialize a vector for each word randomly.\n", + "For each word in the corpus:\n", + "Predict the context words (words nearby) given the target word (skip-gram).\n", + "Adjust the word vectors to minimize the prediction error.\n", + "The learned vectors represent the words’ meanings. These vectors can be used for tasks like document similarity, text classification, and information retrieval.\n", + "\n", + "In our project we use pre-trained word2vec model, specifically trained on software engineering domain." 
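+ ,"\n",
+ "\n",
+ "A minimal sketch of this embedding step (an illustration of the idea, not the exact `vectorize_text` helper imported above): embed a text snippet by averaging the Word2Vec vectors of its in-vocabulary tokens.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "from gensim.models.keyedvectors import KeyedVectors\n",
+ "\n",
+ "# Pre-trained Stack Overflow vectors, at the path used by this repo's helper functions\n",
+ "word_vect = KeyedVectors.load_word2vec_format(\n",
+ "    'codecompasslib/PretrainedModels/SO_vectors_200.bin', binary=True)\n",
+ "\n",
+ "def embed_text(text, model=word_vect):\n",
+ "    # Average the vectors of tokens the model knows; zero vector if none match\n",
+ "    tokens = [t for t in str(text).lower().split() if t in model]\n",
+ "    if not tokens:\n",
+ "        return np.zeros(model.vector_size)\n",
+ "    return np.mean([model[t] for t in tokens], axis=0)\n",
+ "```\n",
+ "\n",
+ "Averaging is a deliberately simple pooling choice: it discards word order but keeps the resulting vectors comparable across texts of different lengths."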
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### K-Nearest Neighbors (KNN)\n", + "KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.\n", + "\n", + "Distance Metric: To measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, or cosine similarity.\n", + "Prediction: Given a new data point, find its K nearest neighbors based on the chosen distance metric.\n", + "In our project we use KNN to find users most similar to our target user." + ] + }, { "cell_type": "code", "execution_count": 5, From dd30b8b84bf0f5333c68e507f5191ebe65d4c598 Mon Sep 17 00:00:00 2001 From: mirandadrummond Date: Sun, 7 Apr 2024 23:50:12 +0200 Subject: [PATCH 4/6] small deletion --- .../models/examples/cosine_similarity.ipynb | 255 ------------------ 1 file changed, 255 deletions(-) delete mode 100644 codecompasslib/models/examples/cosine_similarity.ipynb diff --git a/codecompasslib/models/examples/cosine_similarity.ipynb b/codecompasslib/models/examples/cosine_similarity.ipynb deleted file mode 100644 index c9f3638..0000000 --- a/codecompasslib/models/examples/cosine_similarity.ipynb +++ /dev/null @@ -1,255 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "import pandas as pd\n", - "from typing import Tuple\n", - "from pandas import DataFrame\n", - "import numpy as np\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.metrics.pairwise import cosine_similarity\n", - "\n", - "sys.path.append('../../../')\n", - "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cosine Similairty Model\n", - "\n", - "This model utilizes the cosine similarity between the query and the documents to rank the documents. The cosine similarity is calculated as follows:\n", - "\n", - "- Using NLP and TFIDF, the repository, language and its description are tokenized and vectorized.\n", - "- The cosine similarity is calculated.\n", - "- The repos are ranked based on the cosine similarity." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", - " \"\"\"\n", - " Load the data from the Google Drive\n", - " :return: The non-embedded and embedded datasets\n", - " \"\"\"\n", - " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", - " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", - "\n", - " creds = get_creds_drive()\n", - " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", - " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", - "\n", - " # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data\n", - " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", - " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", - "\n", - " print(\"Data loaded\")\n", - " return df_non_embedded, df_embedded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", - "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", - "\n", - "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load_and_clean_data(df_non_embedded):\n", - " \"\"\"\n", - " Load and clean the dataset from a specified filepath.\n", - " \n", - " Args:\n", - " filepath (str): The file path to the dataset.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: The cleaned DataFrame.\n", - " \"\"\"\n", - " # Load the data\n", - " df = df_non_embedded\n", - "\n", - " # Delete missing values\n", - " df.dropna(inplace=True)\n", - "\n", - " # Delete columns that are not needed\n", - " columns_to_drop = [\n", - " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", - " 'owner_type', 'has_pages', 'has_wiki', \n", - " 'has_issues', 'has_downloads', 'is_fork'\n", - " ]\n", - " df.drop(columns=columns_to_drop, inplace=True)\n", - "\n", - " # Handling missing values in text columns\n", - " df['description'].fillna('', inplace=True)\n", - " df['name'].fillna('', inplace=True)\n", - " df['language'].fillna('', inplace=True)\n", - "\n", - " # Drop duplicates with name\n", - " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", - "\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_cosine_similarity_scores(df):\n", - " \"\"\"\n", - " Calculate cosine similarity scores for the dataset.\n", - "\n", - " Args:\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - "\n", - " Returns:\n", - " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", - " \"\"\"\n", - " # Concatenating the text columns for vectorization\n", - " text_data = df['name'] + \" \" + df['description'] + \" \" + df['language']\n", - "\n", - " # Vectorizing the text data using TF-IDF\n", - " tfidf_vectorizer = TfidfVectorizer()\n", - " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", - "\n", - " # Calculating cosine similarity\n", - " cosine_sim = cosine_similarity(tfidf_matrix)\n", - "\n", - " # Average the cosine similarities for each repo\n", - " similarity_scores = np.mean(cosine_sim, axis=1)\n", - "\n", - " # Adding the new column to the dataset\n", - " df['cosine_similarity_score'] = similarity_scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calculate_cosine_similarity_scores(df):\n", - " \"\"\"\n", - " Calculate cosine similarity scores for the dataset.\n", - "\n", - " Args:\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - "\n", - " Returns:\n", - " tuple: A tuple containing the DataFrame with added similarity scores and the TF-IDF vectorizer.\n", - " \"\"\"\n", - " # Concatenating the text columns for vectorization\n", - " text_data = df['name'] + \" \" + df['description'] + \" \" + 
df['language']\n", - "\n", - " # Vectorizing the text data using TF-IDF\n", - " tfidf_vectorizer = TfidfVectorizer()\n", - " tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)\n", - "\n", - " # Calculating cosine similarity\n", - " cosine_sim = cosine_similarity(tfidf_matrix)\n", - "\n", - " # Average the cosine similarities for each repo\n", - " similarity_scores = np.mean(cosine_sim, axis=1)\n", - "\n", - " # Adding the new column to the dataset\n", - " df['cosine_similarity_score'] = similarity_scores\n", - "\n", - " return df, tfidf_vectorizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10):\n", - " \"\"\"\n", - " Recommend repositories based on user preferences.\n", - "\n", - " Args:\n", - " user_preference (str): The user's preferred keywords or phrases.\n", - " df (pandas.DataFrame): The DataFrame containing repository data.\n", - " tfidf_vectorizer (TfidfVectorizer): The TF-IDF vectorizer used for transforming text data.\n", - " top_n (int, optional): Number of top recommendations to return. Defaults to 10.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: DataFrame containing top_n recommended repositories.\n", - " \"\"\"\n", - " # Vectorize the user preference\n", - " user_pref_vector = tfidf_vectorizer.transform([user_preference])\n", - "\n", - " # Calculate cosine similarity with all repositories\n", - " cosine_scores = cosine_similarity(user_pref_vector, tfidf_vectorizer.transform(df['name'] + \" \" + df['description'] + \" \" + df['language'])).flatten()\n", - "\n", - " # Get the indices of the repositories with the highest similarity scores\n", - " top_indices = np.argsort(cosine_scores)[-top_n:][::-1]\n", - "\n", - " # Select the top n recommended repositories\n", - " recommended_repos = df.iloc[top_indices].reset_index(drop=True)\n", - "\n", - " return recommended_repos[['name', 'description', 'language', 'cosine_similarity_score']]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def main(df):\n", - " \"\"\"\n", - " Main function to run the script.\n", - " \"\"\"\n", - " df, tfidf_vectorizer = calculate_cosine_similarity_scores(df)\n", - " user_preference = \"python\"\n", - " recommended_repos = recommend_repos(user_preference, df, tfidf_vectorizer, top_n=10)\n", - " print(recommended_repos)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From c1496c0556bd7a07fb3fa35c24b549013f3d595f Mon Sep 17 00:00:00 2001 From: KTsula Date: Tue, 13 Aug 2024 01:06:34 +0400 Subject: [PATCH 5/6] Added new users' repos and tested --- .gitignore | 4 +- .../embeddings/embeddings_helper_functions.py | 13 +- .../embeddings/generate_embedded_dataset.py | 4 +- .../models/examples/knn_model.ipynb | 312 +++++++++++++----- codecompasslib/models/lightgbm_model.py | 10 +- 5 files changed, 253 insertions(+), 90 deletions(-) diff --git a/.gitignore b/.gitignore index 58391e9..29fb4f2 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,9 @@ secrets/ **/__pycache__/ codecompasslib/API/datasets/**.csv codecompasslib/API/**.txt 
+codecompasslib/embeddings/**.csv dataset_new.csv codecompasslib/models/**.csv codecompasslib/models/examples/**.csv -codecompasslib/PretrainedModels/ \ No newline at end of file +codecompasslib/PretrainedModels/ +**.csv \ No newline at end of file diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py index 8a13a2f..3471f3f 100644 --- a/codecompasslib/embeddings/embeddings_helper_functions.py +++ b/codecompasslib/embeddings/embeddings_helper_functions.py @@ -1,3 +1,14 @@ +import sys +import os + +# Construct the path to the root directory (one level up from embeddings) +root_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(root_dir) +real_project_dir = os.path.dirname(project_dir) + +# Add the project directory to the Python path +sys.path.insert(0, real_project_dir) + import numpy as np import pandas as pd from gensim.models.keyedvectors import KeyedVectors @@ -35,7 +46,7 @@ def load_word2vec_model(): Citation: Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo. https://doi.org/10.5281/zenodo.1199620 """ - word_vect = KeyedVectors.load_word2vec_format("./codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True) + word_vect = KeyedVectors.load_word2vec_format("codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True) return word_vect # Vectorizing text using domain specific word2vec model diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py index aaa05a7..ce8bd9a 100644 --- a/codecompasslib/embeddings/generate_embedded_dataset.py +++ b/codecompasslib/embeddings/generate_embedded_dataset.py @@ -90,13 +90,13 @@ def generate_openAI_embedded_csv(df, column_to_embed): # Save the current batch DataFrame to a CSV file # Mode 'a' is for append, header=False to avoid writing headers multiple times - batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False) + batch_df.to_csv('df_embedded_1208_batch.csv', mode='a', header=not i, index=False) # Optional: Free up memory by deleting the batch DataFrame if no longer needed del batch_df # Load the CSV file with the embeddings - df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv') + df_with_embeddings = pd.read_csv('df_embedded_1208_batch.csv') return df_with_embeddings def main(): diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb index d3d31e6..9619501 100644 --- a/codecompasslib/models/examples/knn_model.ipynb +++ b/codecompasslib/models/examples/knn_model.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -54,109 +54,257 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Download 11%.\n", + "\n", + "Download 23%.\n", + "\n", + "Download 35%.\n", + "\n", + "Download 47%.\n", + "\n", + "Download 59%.\n", + "\n", + "Download 71%.\n", + "\n", + "Download 83%.\n", + "\n", + "Download 95%.\n", + "\n", + "Download 100%.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"c:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\API\\drive_operations.py:88: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " return read_csv(fh)\n" + ] + } + ], "source": [ - "def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]:\n", - " \"\"\"\n", - " Load the data from the Google Drive\n", - " :return: The non-embedded and embedded datasets\n", - " \"\"\"\n", - " DRIVE_ID = \"0AL1DtB4TdEWdUk9PVA\"\n", - " DATA_FOLDER = \"13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\"\n", - "\n", - " creds = get_creds_drive()\n", - " df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)\n", - " df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)\n", - "\n", - " # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data\n", - " # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')\n", - " # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')\n", + "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", "\n", - " print(\"Data loaded\")\n", - " return df_non_embedded, df_embedded" + "df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "usecols=['owner_user', 'name', 'description', 'language']\n", + "# drop every column except for these\n", + "df = df_non_embedded.copy()\n", + "df = df[usecols]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, "metadata": {}, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m full_data_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m full_data_embedded_folder_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m139wi78iRzhwGZwxmI5WALoYocR-Rk9By\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m df_non_embedded, df_embedded \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfull_data_folder_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfull_data_embedded_folder_id\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[5], line 9\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(full_data_folder_id, full_data_embedded_folder_id)\u001b[0m\n\u001b[1;32m 6\u001b[0m DRIVE_ID \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m0AL1DtB4TdEWdUk9PVA\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 7\u001b[0m DATA_FOLDER \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 9\u001b[0m creds 
\u001b[38;5;241m=\u001b[39m \u001b[43mget_creds_drive\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m df_non_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_folder_id)\n\u001b[1;32m 11\u001b[0m df_embedded: DataFrame \u001b[38;5;241m=\u001b[39m download_csv_as_pd_dataframe(creds\u001b[38;5;241m=\u001b[39mcreds, file_id\u001b[38;5;241m=\u001b[39mfull_data_embedded_folder_id)\n", - "File \u001b[0;32m~/VSCode/CodeCompass/codecompasslib/models/examples/../../../codecompasslib/API/drive_operations.py:24\u001b[0m, in \u001b[0;36mget_creds_drive\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_creds_drive\u001b[39m() \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Credentials:\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;124;03m Get the credentials for the Google Drive API\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03m :return: None\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m creds: Credentials \u001b[38;5;241m=\u001b[39m \u001b[43mCredentials\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_authorized_user_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mOUTER_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/secrets/token.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mSCOPES\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m creds \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mexpired \u001b[38;5;129;01mand\u001b[39;00m creds\u001b[38;5;241m.\u001b[39mrefresh_token:\n", - "File \u001b[0;32m~/VSCode/CodeCompass/.venv/lib/python3.11/site-packages/google/oauth2/credentials.py:537\u001b[0m, in \u001b[0;36mCredentials.from_authorized_user_file\u001b[0;34m(cls, filename, scopes)\u001b[0m\n\u001b[1;32m 521\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfrom_authorized_user_file\u001b[39m(\u001b[38;5;28mcls\u001b[39m, filename, scopes\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[1;32m 523\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Creates a Credentials instance from an authorized user json file.\u001b[39;00m\n\u001b[1;32m 524\u001b[0m \n\u001b[1;32m 525\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;124;03m ValueError: If the file is not in the expected format.\u001b[39;00m\n\u001b[1;32m 536\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 537\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mio\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m json_file:\n\u001b[1;32m 538\u001b[0m data \u001b[38;5;241m=\u001b[39m json\u001b[38;5;241m.\u001b[39mload(json_file)\n\u001b[1;32m 539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39mfrom_authorized_user_info(data, scopes)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/Users/mirandadrummond/VSCode/CodeCompass/codecompasslib/models/examples/../../../secrets/token.json'" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
owner_usernamedescriptionlanguage
4Rameshwar0852Automation_ProjectAutomated Bash Script to automate log Backup g...Shell
18Rameshwar0852IKONCBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION...Python
19Rameshwar0852javamavensonarrgohelmk8No descriptionHTML
23Rameshwar0852node001files_repoJavaScript
24Rameshwar0852nodeandjsnode java script applicationJavaScript
...............
2583820pinaxpinax-bloga blog app for DjangoPython
2583821montyloungedjango-mingusa Django blog engine leveraging reusable apps ...JavaScript
2583822WuXianglongGeekBlogA full blog system based on DjangoJavaScript
2583823NARKOZhacker-scriptsBased on a true storyJavaScript
2583824matthewbdalydjango_tutorial_blog_ngThe source for the new version of my Django tu...Python
\n", + "

1524223 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " owner_user name \\\n", + "4 Rameshwar0852 Automation_Project \n", + "18 Rameshwar0852 IKON \n", + "19 Rameshwar0852 javamavensonarrgohelmk8 \n", + "23 Rameshwar0852 node001 \n", + "24 Rameshwar0852 nodeandjs \n", + "... ... ... \n", + "2583820 pinax pinax-blog \n", + "2583821 montylounge django-mingus \n", + "2583822 WuXianglong GeekBlog \n", + "2583823 NARKOZ hacker-scripts \n", + "2583824 matthewbdaly django_tutorial_blog_ng \n", + "\n", + " description language \n", + "4 Automated Bash Script to automate log Backup g... Shell \n", + "18 CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... Python \n", + "19 No description HTML \n", + "23 files_repo JavaScript \n", + "24 node java script application JavaScript \n", + "... ... ... \n", + "2583820 a blog app for Django Python \n", + "2583821 a Django blog engine leveraging reusable apps ... JavaScript \n", + "2583822 A full blog system based on Django JavaScript \n", + "2583823 Based on a true story JavaScript \n", + "2583824 The source for the new version of my Django tu... Python \n", + "\n", + "[1524223 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n", - "full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'\n", - "\n", - "df_non_embedded, df_embedded = load_data(full_data_folder_id, full_data_embedded_folder_id)" + "df.dropna()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ - "def load_and_clean_data(df_non_embedded):\n", - " \"\"\"\n", - " Load and clean the dataset from a specified filepath.\n", - " \n", - " Args:\n", - " filepath (str): The file path to the dataset.\n", - "\n", - " Returns:\n", - " pandas.DataFrame: The cleaned DataFrame.\n", - " \"\"\"\n", - " # Load the data\n", - " df = df_non_embedded\n", - "\n", - " # Delete missing values\n", - " df.dropna(inplace=True)\n", - "\n", - " # Delete columns that are not needed\n", - " columns_to_drop = [\n", - " 'is_archived', 'is_disabled', 'is_template', 'has_projects', \n", - " 'owner_type', 'has_pages', 'has_wiki', \n", - " 'has_issues', 'has_downloads', 'is_fork'\n", - " ]\n", - " df.drop(columns=columns_to_drop, inplace=True)\n", - "\n", - " # Handling missing values in text columns\n", - " df['description'].fillna('', inplace=True)\n", - " df['name'].fillna('', inplace=True)\n", - " df['language'].fillna('', inplace=True)\n", - "\n", - " # Drop duplicates with name\n", - " df.drop_duplicates(subset='name', keep='first', inplace=True)\n", - "\n", - " return df" + "# convert language column to string type\n", + "df['language'] = df['language'].astype(str)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File not found.\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'vector_size'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m 
user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43membedded_user_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\series.py:4908\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4780\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4781\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4782\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4788\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4791\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4906\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4907\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4908\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4909\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4910\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4911\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4912\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4913\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4914\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4915\u001b[0m \u001b[43m 
\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", + "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m 
embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\models\\model_diff_repos.py:60\u001b[0m, in \u001b[0;36mvectorize_text\u001b[1;34m(text, word_vect)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvectorize_text\u001b[39m(text, word_vect):\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m vector_sum \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(\u001b[43mword_vect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector_size\u001b[49m) \u001b[38;5;66;03m# Initialize an array to store the sum of word vectors\u001b[39;00m\n\u001b[0;32m 61\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;66;03m# Initialize a count to keep track of the number of words found in the vocabulary\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n", + "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'vector_size'" + ] + } + ], "source": [ - "df = load_and_clean_data(df_non_embedded)\n", - "\n", - "# count unique languges\n", - "df['language'].nunique()\n", - "\n", "# Create list of unique languages with _ prefix\n", "languages = ['_' + language for language in df['language'].unique()]\n", "\n", @@ -193,8 +341,10 @@ "embedded_user_df['name'] = user_df['name'].fillna('') \n", "embedded_user_df['description'] = user_df['description'].fillna('')\n", "\n", - "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(vectorize_text)\n", - "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(vectorize_text)\n", + "\n", + "word2vec_model = load_word2vec_model()\n", + "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))\n", + "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))\n", "embedded_user_df\n", "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)" ] @@ -256,7 +406,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py index 9d21174..7a1ba0e 100644 --- a/codecompasslib/models/lightgbm_model.py +++ b/codecompasslib/models/lightgbm_model.py @@ -128,13 +128,13 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu :return: The non-embedded and embedded datasets """ - creds = get_creds_drive() - df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id) - df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) + # creds = get_creds_drive() + # df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id) + # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data - # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv') - # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv') + df_non_embedded = pd.read_csv('codecompasslib/models/data_full_new.csv') + df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined_new.csv') print("Data loaded") return df_non_embedded, df_embedded From ebc0d7bb4b16f3ba06178fdb8e295b645e4c0ff9 Mon Sep 17 00:00:00 2001 From: KTsula Date: Thu, 15 Aug 2024 02:41:30 +0400 Subject: [PATCH 6/6] Add caching mechanism with pickle --- codecompasslib/API/helper_functions.py | 29 +++++++++++++++++++++++ codecompasslib/models/lightgbm_model.py | 14 ++++++----- codecompasslib/recommendations_cache.pkl | Bin 0 -> 1180 bytes frontend/recommender/app.py | 12 +++++++--- 4 files changed, 46 insertions(+), 9 deletions(-) create mode 100644 codecompasslib/recommendations_cache.pkl diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py index 4b04da0..dfab9c7 100644 --- a/codecompasslib/API/helper_functions.py +++ b/codecompasslib/API/helper_functions.py @@ -1,3 +1,5 @@ +import pickle +import os from json import load from pandas import DataFrame from os.path import dirname @@ -17,6 +19,33 @@ def save_to_csv(data: any, filename: str) -> None: df: DataFrame = DataFrame(data) df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False) +def save_cache(cache_data: dict, cache_filename: str): + """ + Save a dictionary to a file in pickle format. + + :param cache_data: The dictionary to be saved. + :param cache_filename: The name of the file where the cache will be saved. + """ + with open(cache_filename, 'wb') as cache_file: + pickle.dump(cache_data, cache_file) + print(f"Cache saved to {cache_filename}") + +def load_cache(cache_filename: str) -> dict: + """ + Load a dictionary from a pickle file. + + :param cache_filename: The name of the file where the cache is stored. + :return: The loaded dictionary. + """ + if os.path.exists(cache_filename): + with open(cache_filename, 'rb') as cache_file: + cache_data = pickle.load(cache_file) + print(f"Cache loaded from {cache_filename}") + return cache_data + else: + print(f"No cache found at {cache_filename}") + return {} + def list_to_txt(data: list, file_name: str) -> bool: """ diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py index 7a1ba0e..7d64b7f 100644 --- a/codecompasslib/models/lightgbm_model.py +++ b/codecompasslib/models/lightgbm_model.py @@ -19,7 +19,7 @@ from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos - +from codecompasslib.API.helper_functions import save_cache, load_cache def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]: """ @@ -38,7 +38,6 @@ def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl del df[label_col] return df, y - def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]: """ Trains a LightGBM model using the provided merged dataframe. @@ -133,8 +132,8 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) # Having data locally works much faster than retrieving from drive. 
Uncomment the following lines to use local data - df_non_embedded = pd.read_csv('codecompasslib/models/data_full_new.csv') - df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined_new.csv') + df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv') + df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv') print("Data loaded") return df_non_embedded, df_embedded @@ -166,12 +165,13 @@ def preprocess_data(df_embedded: DataFrame, df_non_embedded: DataFrame, # Add target column: 1 if the repo is starred or owned by the user, else 0 owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]] starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]] + print("Owned length: ", len(owned_by_target_repo_ids)) + print("Starred length: ", len(starred_repo_ids)) starred_or_owned_by_user:List = starred_repo_ids + owned_by_target_repo_ids df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0) return df_merged, starred_or_owned_by_user - def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame, df_embedded: DataFrame, number_of_recommendations: int = 10) -> list: """ @@ -217,5 +217,7 @@ def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFra else: counter += 1 recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index])) - + cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl') + cached_recommendations[target_user] = recommendations + save_cache(cached_recommendations, 'codecompasslib/recommendations_cache.pkl') return recommendations diff --git a/codecompasslib/recommendations_cache.pkl b/codecompasslib/recommendations_cache.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ef41e7b23debd1252c53b81c729f4ee2439a5634 GIT binary patch literal 1180 zcmaje&ui2`6bJBVyFaRK_0mHr9t*v=2!iz>*iw4g;!^c2@{&w4W0INeWM;QXu`CL< zg-{Sj_74yhgkDtqaa+83kscMnlP6EbYY&=TC+%hFyh8$$_sN%;&7vPg@h#b5LQNBOVfp#C&Y5sy2NN2gaC8Qe^?yjkj#&Tz literal 0 HcmV?d00001 diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py index 68b8cfc..754994f 100644 --- a/frontend/recommender/app.py +++ b/frontend/recommender/app.py @@ -13,6 +13,7 @@ # Import necessary functions from codecompasslib from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data +from codecompasslib.API.helper_functions import load_cache # Function to load cached data def load_cached_data(): @@ -24,6 +25,7 @@ def load_cached_data(): full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By' st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id) return st.session_state.cached_data + def main(): # Load the data @@ -41,9 +43,13 @@ def main(): if target_user not in df_embedded['owner_user'].values: st.error("User not found in the dataset. 
Please enter a valid username.")
     else:
-        # Generate recommendations
-        with st.spinner('Generating recommendations...'):
-            recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
+        cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+        if target_user in cached_recommendations:
+            recommendations = cached_recommendations[target_user]
+        else:
+            # Generate recommendations
+            with st.spinner('Generating recommendations...'):
+                recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
 
         # Display recommendations
         st.subheader("Recommendations")
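
The FileNotFoundError recorded at the top of the notebook comes from Credentials.from_authorized_user_file being handed a secrets/token.json path that does not exist on the machine running the example. A minimal sketch of a friendlier guard follows; the wrapper name, TOKEN_PATH, and SCOPES value are illustrative assumptions, and the real helper is get_creds_drive in codecompasslib/API/drive_operations.py.

    import os
    from google.oauth2.credentials import Credentials

    # Assumed values for illustration only: the token location and the exact
    # Drive scope used by the project are not visible in this patch.
    TOKEN_PATH = os.path.join("secrets", "token.json")
    SCOPES = ["https://www.googleapis.com/auth/drive"]

    def get_creds_drive_safe() -> Credentials:
        # Fail fast with an actionable message instead of a bare FileNotFoundError
        if not os.path.exists(TOKEN_PATH):
            raise FileNotFoundError(
                f"{TOKEN_PATH} is missing: generate an OAuth token for the Drive API, "
                "or read the CSVs locally as the commented lines in load_data suggest."
            )
        return Credentials.from_authorized_user_file(TOKEN_PATH, SCOPES)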
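
One pandas detail worth noting about the df.dropna() cell whose output is shown above: called without arguments it returns a cleaned copy and leaves df untouched, so on its own that cell only previews the result. A minimal illustration (the toy frame is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"name": ["IKON", None], "language": ["Python", None]})
    df.dropna()                # returns a cleaned copy; df itself still holds the NaN row
    df = df.dropna()           # rebind the name to persist the cleaning
    # df.dropna(inplace=True)  # equivalent in-place form used elsewhere in the notebook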
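
The AttributeError above ('NoneType' object has no attribute 'vector_size') is a knock-on effect of the "File not found." message printed just before it: load_word2vec_model() returned None, and vectorize_text then dereferenced the missing model. Below is a sketch of a defensive variant, reconstructed from the frames visible in the traceback; the None guard, the out-of-vocabulary check, and the zero-vector fallback are assumptions rather than the project's actual implementation.

    import numpy as np

    def vectorize_text_safe(text: str, word_vect) -> np.ndarray:
        # Surface the real problem (model never loaded) instead of an opaque AttributeError
        if word_vect is None:
            raise ValueError("word2vec model is not loaded; check the model file path")
        vector_sum = np.zeros(word_vect.vector_size)  # running sum of word vectors
        count = 0
        for word in text.split():
            if word in word_vect:                     # skip out-of-vocabulary tokens
                vector_sum += word_vect[word]
                count += 1
        # Average the found vectors; an all-zero vector when nothing matched
        return vector_sum / count if count else vector_sum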
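
The caching patch reduces to one pattern: look the user up in a pickled dict before running the model, and persist any fresh result for next time. A minimal sketch combining the new helpers is below; compute_recommendations is a hypothetical stand-in for generate_lightGBM_recommendations and its dataframe arguments.

    from codecompasslib.API.helper_functions import load_cache, save_cache

    CACHE_PATH = "codecompasslib/recommendations_cache.pkl"

    def recommendations_for(target_user: str, compute_recommendations) -> list:
        cache = load_cache(CACHE_PATH)      # returns {} when no pickle exists yet
        if target_user in cache:            # cache hit: skip the expensive model run
            return cache[target_user]
        recommendations = compute_recommendations(target_user)
        cache[target_user] = recommendations
        save_cache(cache, CACHE_PATH)       # persist for subsequent sessions
        return recommendations

Note that nothing invalidates this cache: a user's recommendations stay frozen until recommendations_cache.pkl is deleted or overwritten, which is worth keeping in mind now that the pickle itself is checked into the repository.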