diff --git a/.gitignore b/.gitignore
index 58391e9..29fb4f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,9 @@ secrets/
 **/__pycache__/
 codecompasslib/API/datasets/**.csv
 codecompasslib/API/**.txt
+codecompasslib/embeddings/**.csv
 dataset_new.csv
 codecompasslib/models/**.csv
 codecompasslib/models/examples/**.csv
-codecompasslib/PretrainedModels/
\ No newline at end of file
+codecompasslib/PretrainedModels/
+**.csv
\ No newline at end of file
diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py
index 4b04da0..dfab9c7 100644
--- a/codecompasslib/API/helper_functions.py
+++ b/codecompasslib/API/helper_functions.py
@@ -1,3 +1,5 @@
+import pickle
+import os
 from json import load
 from pandas import DataFrame
 from os.path import dirname
@@ -17,6 +19,33 @@ def save_to_csv(data: any, filename: str) -> None:
     df: DataFrame = DataFrame(data)
     df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False)
 
+def save_cache(cache_data: dict, cache_filename: str) -> None:
+    """
+    Save a dictionary to a file in pickle format.
+
+    :param cache_data: The dictionary to be saved.
+    :param cache_filename: The name of the file where the cache will be saved.
+    """
+    with open(cache_filename, 'wb') as cache_file:
+        pickle.dump(cache_data, cache_file)
+    print(f"Cache saved to {cache_filename}")
+
+def load_cache(cache_filename: str) -> dict:
+    """
+    Load a dictionary from a pickle file.
+
+    :param cache_filename: The name of the file where the cache is stored.
+    :return: The loaded dictionary, or an empty dict if no cache file exists.
+    """
+    if os.path.exists(cache_filename):
+        with open(cache_filename, 'rb') as cache_file:
+            cache_data = pickle.load(cache_file)
+        print(f"Cache loaded from {cache_filename}")
+        return cache_data
+    else:
+        print(f"No cache found at {cache_filename}")
+        return {}
+
 def list_to_txt(data: list, file_name: str) -> bool:
     """
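The new `save_cache`/`load_cache` pair is a plain pickle round-trip over a dictionary. A minimal usage sketch (the cache path here is hypothetical, for illustration only; note that unpickling executes code embedded in the file, so only load cache files your own code wrote):

```python
from codecompasslib.API.helper_functions import save_cache, load_cache

CACHE_PATH = 'example_cache.pkl'  # hypothetical path, not part of the diff

cache = load_cache(CACHE_PATH)    # returns {} on the first run, before the file exists
cache['some_user'] = [('repo_id', 'repo_owner', 0.93)]
save_cache(cache, CACHE_PATH)     # pickle.dump the whole dict back to disk

assert load_cache(CACHE_PATH)['some_user'][0][2] == 0.93
```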
diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py
index 8a13a2f..3471f3f 100644
--- a/codecompasslib/embeddings/embeddings_helper_functions.py
+++ b/codecompasslib/embeddings/embeddings_helper_functions.py
@@ -1,3 +1,14 @@
+import sys
+import os
+
+# Resolve the repository root: two directory levels above this file's directory
+root_dir = os.path.dirname(os.path.abspath(__file__))
+project_dir = os.path.dirname(root_dir)
+real_project_dir = os.path.dirname(project_dir)
+
+# Add the repository root to the Python path so codecompasslib imports resolve
+sys.path.insert(0, real_project_dir)
+
 import numpy as np
 import pandas as pd
 from gensim.models.keyedvectors import KeyedVectors
@@ -35,7 +46,7 @@ def load_word2vec_model():
     Citation: Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo.
     https://doi.org/10.5281/zenodo.1199620
     """
-    word_vect = KeyedVectors.load_word2vec_format("./codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
+    word_vect = KeyedVectors.load_word2vec_format("codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
     return word_vect
 
 # Vectorizing text using domain specific word2vec model
diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py
index aaa05a7..ce8bd9a 100644
--- a/codecompasslib/embeddings/generate_embedded_dataset.py
+++ b/codecompasslib/embeddings/generate_embedded_dataset.py
@@ -90,13 +90,13 @@ def generate_openAI_embedded_csv(df, column_to_embed):
 
         # Save the current batch DataFrame to a CSV file
         # Mode 'a' is for append, header=False to avoid writing headers multiple times
-        batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False)
+        batch_df.to_csv('df_embedded_1208_batch.csv', mode='a', header=not i, index=False)
 
         # Optional: Free up memory by deleting the batch DataFrame if no longer needed
         del batch_df
 
     # Load the CSV file with the embeddings
-    df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv')
+    df_with_embeddings = pd.read_csv('df_embedded_1208_batch.csv')
     return df_with_embeddings
 
 def main():
diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py
index 9dafd97..e06d71c 100644
--- a/codecompasslib/models/cosine_similarity_model.py
+++ b/codecompasslib/models/cosine_similarity_model.py
@@ -39,6 +39,7 @@ def load_data(full_data_folder_id: str) -> DataFrame:
     """
     creds = get_creds_drive()
     df: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+    return df
 
 def clean_data(df: DataFrame) -> DataFrame:
diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb
new file mode 100644
index 0000000..9619501
--- /dev/null
+++ b/codecompasslib/models/examples/knn_model.ipynb
@@ -0,0 +1,414 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "import pandas as pd\n",
+    "from typing import Tuple\n",
+    "from pandas import DataFrame\n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from sklearn.neighbors import NearestNeighbors\n",
+    "\n",
+    "sys.path.append('../../../')\n",
+    "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n",
+    "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Word2Vec\n",
+    "Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning from the contexts in which they appear.\n",
+    "\n",
+    "Here’s how it works:\n",
+    "\n",
+    "1. Initialize a vector for each word randomly.\n",
+    "2. For each word in the corpus, predict the context words (words nearby) given the target word (skip-gram).\n",
+    "3. Adjust the word vectors to minimize the prediction error.\n",
+    "\n",
+    "The learned vectors represent the words’ meanings and can be used for tasks like document similarity, text classification, and information retrieval.\n",
+    "\n",
+    "In our project we use a pre-trained word2vec model trained specifically on the software-engineering domain (a minimal sketch of the vector-averaging step we rely on appears after the data-preparation cells below)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### K-Nearest Neighbors (KNN)\n",
+    "KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.\n",
+    "\n",
+    "- Distance metric: to measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, and cosine similarity.\n",
+    "- Prediction: given a new data point, find its K nearest neighbors based on the chosen distance metric.\n",
+    "\n",
+    "In our project we use KNN to find the users most similar to our target user."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Download 11%.\n",
+      "\n",
+      "Download 23%.\n",
+      "\n",
+      "Download 35%.\n",
+      "\n",
+      "Download 47%.\n",
+      "\n",
+      "Download 59%.\n",
+      "\n",
+      "Download 71%.\n",
+      "\n",
+      "Download 83%.\n",
+      "\n",
+      "Download 95%.\n",
+      "\n",
+      "Download 100%.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\API\\drive_operations.py:88: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  return read_csv(fh)\n"
+     ]
+    }
+   ],
+   "source": [
+    "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n",
+    "\n",
+    "df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "usecols = ['owner_user', 'name', 'description', 'language']\n",
+    "# keep only these columns\n",
+    "df = df_non_embedded.copy()\n",
+    "df = df[usecols]"
+   ]
+  },
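As noted in the Word2Vec cell above, `vectorize_text` represents a text as the average of the vectors of its in-vocabulary words. Here is a minimal, self-contained sketch of that averaging idea using a tiny hand-built `KeyedVectors` as a stand-in for the real SO_vectors_200 model (the project's `vectorize_text` may differ in details):

```python
import numpy as np
from gensim.models import KeyedVectors

# Tiny stand-in vocabulary for illustration; the real model has 200-dim vectors.
kv = KeyedVectors(vector_size=4)
kv.add_vectors(["python", "django", "blog"],
               np.random.default_rng(0).normal(size=(3, 4)).astype(np.float32))

def mean_vector(text: str, word_vect: KeyedVectors) -> np.ndarray:
    """Average the vectors of in-vocabulary words; zero vector if none match."""
    words = [w for w in text.lower().split() if w in word_vect.key_to_index]
    if not words:
        return np.zeros(word_vect.vector_size, dtype=np.float32)
    return np.mean([word_vect[w] for w in words], axis=0)

print(mean_vector("A Django blog engine", kv))  # averages 'django' and 'blog'
```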
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
owner_usernamedescriptionlanguage
4Rameshwar0852Automation_ProjectAutomated Bash Script to automate log Backup g...Shell
18Rameshwar0852IKONCBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION...Python
19Rameshwar0852javamavensonarrgohelmk8No descriptionHTML
23Rameshwar0852node001files_repoJavaScript
24Rameshwar0852nodeandjsnode java script applicationJavaScript
...............
2583820pinaxpinax-bloga blog app for DjangoPython
2583821montyloungedjango-mingusa Django blog engine leveraging reusable apps ...JavaScript
2583822WuXianglongGeekBlogA full blog system based on DjangoJavaScript
2583823NARKOZhacker-scriptsBased on a true storyJavaScript
2583824matthewbdalydjango_tutorial_blog_ngThe source for the new version of my Django tu...Python
\n", + "

1524223 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " owner_user name \\\n", + "4 Rameshwar0852 Automation_Project \n", + "18 Rameshwar0852 IKON \n", + "19 Rameshwar0852 javamavensonarrgohelmk8 \n", + "23 Rameshwar0852 node001 \n", + "24 Rameshwar0852 nodeandjs \n", + "... ... ... \n", + "2583820 pinax pinax-blog \n", + "2583821 montylounge django-mingus \n", + "2583822 WuXianglong GeekBlog \n", + "2583823 NARKOZ hacker-scripts \n", + "2583824 matthewbdaly django_tutorial_blog_ng \n", + "\n", + " description language \n", + "4 Automated Bash Script to automate log Backup g... Shell \n", + "18 CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... Python \n", + "19 No description HTML \n", + "23 files_repo JavaScript \n", + "24 node java script application JavaScript \n", + "... ... ... \n", + "2583820 a blog app for Django Python \n", + "2583821 a Django blog engine leveraging reusable apps ... JavaScript \n", + "2583822 A full blog system based on Django JavaScript \n", + "2583823 Based on a true story JavaScript \n", + "2583824 The source for the new version of my Django tu... Python \n", + "\n", + "[1524223 rows x 4 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# convert language column to string type\n", + "df['language'] = df['language'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File not found.\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'vector_size'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43membedded_user_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: 
vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\series.py:4908\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4780\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4781\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4782\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4788\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4791\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4906\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4907\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4908\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4909\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4910\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4911\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4912\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4913\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4914\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4915\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default 
has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", + "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", + "Cell \u001b[1;32mIn[54], line 39\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n", + "File \u001b[1;32mc:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\models\\model_diff_repos.py:60\u001b[0m, in \u001b[0;36mvectorize_text\u001b[1;34m(text, word_vect)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvectorize_text\u001b[39m(text, word_vect):\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m vector_sum \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(\u001b[43mword_vect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector_size\u001b[49m) \u001b[38;5;66;03m# Initialize an array to store the sum of word vectors\u001b[39;00m\n\u001b[0;32m 61\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;66;03m# Initialize a count to keep track of the number of words found in the vocabulary\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n", + "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'vector_size'" + ] + } + ], + "source": [ + "# Create 
list of unique languages, each prefixed with '_'\n",
+    "languages = ['_' + language for language in df['language'].unique()]\n",
+    "\n",
+    "# One-hot encode the languages without adding a column prefix\n",
+    "df = pd.get_dummies(df, columns=['language'], prefix='')\n",
+    "\n",
+    "# Turn df into a user-specific df with owner_user as a unique identifier, appending descriptions and keeping 1 if a language is present in at least one repo\n",
+    "\n",
+    "# Create a dictionary for aggregation\n",
+    "aggregation_dict = {\n",
+    "    'name': lambda x: list(x),\n",
+    "    'description': lambda x: list(x)\n",
+    "}\n",
+    "\n",
+    "# Add columns for languages\n",
+    "for lang in languages:\n",
+    "    aggregation_dict[lang] = 'max'\n",
+    "\n",
+    "# Group by 'owner_user' and aggregate\n",
+    "user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()\n",
+    "\n",
+    "# Join each user's lists of names and descriptions into single strings\n",
+    "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+    "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+    "user_df.head()\n",
+    "\n",
+    "# Text preprocessing\n",
+    "embedded_user_df = user_df.copy()\n",
+    "embedded_user_df['name'] = user_df['name'].fillna('')\n",
+    "embedded_user_df['description'] = user_df['description'].fillna('')\n",
+    "\n",
+    "word2vec_model = load_word2vec_model()\n",
+    "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+    "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+    "embedded_user_df\n",
+    "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)"
+   ]
+  },
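The `AttributeError` recorded above is a symptom of `load_word2vec_model()` apparently returning `None` when the pre-trained .bin file is missing (hence the "File not found." message), which `vectorize_text` then dereferences. A defensive variant that fails fast instead — a sketch only, assuming the same model path the library uses; `load_word2vec_model_strict` is a hypothetical name, not part of the codebase:

```python
import os
from gensim.models.keyedvectors import KeyedVectors

MODEL_PATH = "codecompasslib/PretrainedModels/SO_vectors_200.bin"

def load_word2vec_model_strict(path: str = MODEL_PATH) -> KeyedVectors:
    """Load the pre-trained vectors, raising immediately if the file is absent."""
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"word2vec model not found at {path}; download SO_vectors_200.bin "
            "from https://doi.org/10.5281/zenodo.1199620 into PretrainedModels/"
        )
    return KeyedVectors.load_word2vec_format(path, binary=True)
```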
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Transform df into a feature matrix that KNN can use\n",
+    "# Create a list of all the vectors\n",
+    "vectors = []\n",
+    "repo_df = embedded_user_df * 1  # convert all boolean values to 0 or 1\n",
+    "\n",
+    "for row in repo_df.index:\n",
+    "    vector = []\n",
+    "    for column in ['name_vector', 'description_vector']:\n",
+    "        if isinstance(repo_df.at[row, column], np.ndarray):\n",
+    "            for element in repo_df.at[row, column]:\n",
+    "                vector.append(element)\n",
+    "        else:\n",
+    "            vector.append(repo_df.at[row, column])\n",
+    "    vectors.append(vector)\n",
+    "\n",
+    "# Train the Nearest Neighbors model\n",
+    "k = 5  # Number of neighbors to find\n",
+    "nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')\n",
+    "nn_model.fit(vectors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example usage\n",
+    "\n",
+    "target_user = 21  # positional index of the target user\n",
+    "# nearest neighbors, excluding the target user itself\n",
+    "neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]\n",
+    "neighbors"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py
index 9d21174..7d64b7f 100644
--- a/codecompasslib/models/lightgbm_model.py
+++ b/codecompasslib/models/lightgbm_model.py
@@ -19,7 +19,7 @@
 from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive
 from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos
-
+from codecompasslib.API.helper_functions import save_cache, load_cache
 
 def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]:
     """
@@ -38,7 +38,6 @@ def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl
         del df[label_col]
     return df, y
 
-
 def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]:
     """
     Trains a LightGBM model using the provided merged dataframe.
@@ -128,13 +127,13 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu
 
     :return: The non-embedded and embedded datasets
     """
-    creds = get_creds_drive()
-    df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
-    df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
+    # creds = get_creds_drive()
+    # df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+    # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
-    # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data
+    # Having data locally works much faster than retrieving from Drive, so the local reads below are active by default; restore the lines above to download from Drive instead.
-    # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
-    # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
+    df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
+    df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
 
     print("Data loaded")
     return df_non_embedded, df_embedded
@@ -166,12 +165,13 @@
     # Add target column: 1 if the repo is starred or owned by the user, else 0
     owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]]
     starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]]
+    print("Owned length: ", len(owned_by_target_repo_ids))
+    print("Starred length: ", len(starred_repo_ids))
     starred_or_owned_by_user: List = starred_repo_ids + owned_by_target_repo_ids
     df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0)
 
     return df_merged, starred_or_owned_by_user
 
-
 def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame, df_embedded: DataFrame,
                                       number_of_recommendations: int = 10) -> list:
     """
@@ -217,5 +217,7 @@
         else:
             counter += 1
             recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index]))
-
+    cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+    cached_recommendations[target_user] = recommendations
+    save_cache(cached_recommendations, 'codecompasslib/recommendations_cache.pkl')
     return recommendations
diff --git a/codecompasslib/recommendations_cache.pkl b/codecompasslib/recommendations_cache.pkl
new file mode 100644
index 0000000..ef41e7b
Binary files /dev/null and b/codecompasslib/recommendations_cache.pkl differ
diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py
index 68b8cfc..754994f 100644
--- a/frontend/recommender/app.py
+++ b/frontend/recommender/app.py
@@ -13,6 +13,7 @@
 # Import necessary functions from codecompasslib
 from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data
+from codecompasslib.API.helper_functions import load_cache
 
 # Function to load cached data
 def load_cached_data():
@@ -24,6 +25,7 @@
         full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'
         st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id)
     return st.session_state.cached_data
+
 
 def main():
     # Load the data
@@ -41,9 +43,13 @@
         if target_user not in df_embedded['owner_user'].values:
             st.error("User not found in the dataset. Please enter a valid username.")
         else:
-            # Generate recommendations
-            with st.spinner('Generating recommendations...'):
-                recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
+            cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+            if target_user in cached_recommendations:
+                recommendations = cached_recommendations[target_user]
+            else:
+                # Generate recommendations
+                with st.spinner('Generating recommendations...'):
+                    recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
 
             # Display recommendations
             st.subheader("Recommendations")
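With this change the read half of the caching lives in app.py while the write half lives inside generate_lightGBM_recommendations. One way to centralize the lookup-or-compute pattern — a sketch only; `get_recommendations_cached` is a hypothetical helper, and if adopted, the cache write inside generate_lightGBM_recommendations could be dropped to avoid double writes:

```python
from codecompasslib.API.helper_functions import load_cache, save_cache
from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations

CACHE_PATH = 'codecompasslib/recommendations_cache.pkl'  # same file the diff introduces

def get_recommendations_cached(target_user, df_non_embedded, df_embedded,
                               number_of_recommendations=10):
    """Return cached recommendations for target_user, computing and caching on a miss."""
    cache = load_cache(CACHE_PATH)
    if target_user not in cache:
        cache[target_user] = generate_lightGBM_recommendations(
            target_user, df_non_embedded, df_embedded,
            number_of_recommendations=number_of_recommendations)
        save_cache(cache, CACHE_PATH)
    return cache[target_user]
```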