diff --git a/.gitignore b/.gitignore
index 58391e9..29fb4f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,9 @@ secrets/
**/__pycache__/
codecompasslib/API/datasets/**.csv
codecompasslib/API/**.txt
+codecompasslib/embeddings/**.csv
dataset_new.csv
codecompasslib/models/**.csv
codecompasslib/models/examples/**.csv
-codecompasslib/PretrainedModels/
\ No newline at end of file
+codecompasslib/PretrainedModels/
+**.csv
\ No newline at end of file
diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py
index 4b04da0..dfab9c7 100644
--- a/codecompasslib/API/helper_functions.py
+++ b/codecompasslib/API/helper_functions.py
@@ -1,3 +1,5 @@
+import pickle
+import os
from json import load
from pandas import DataFrame
from os.path import dirname
@@ -17,6 +19,33 @@ def save_to_csv(data: any, filename: str) -> None:
df: DataFrame = DataFrame(data)
df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False)
+def save_cache(cache_data: dict, cache_filename: str):
+ """
+ Save a dictionary to a file in pickle format.
+
+ :param cache_data: The dictionary to be saved.
+ :param cache_filename: The name of the file where the cache will be saved.
+ """
+ with open(cache_filename, 'wb') as cache_file:
+ pickle.dump(cache_data, cache_file)
+ print(f"Cache saved to {cache_filename}")
+
+def load_cache(cache_filename: str) -> dict:
+ """
+ Load a dictionary from a pickle file.
+
+ :param cache_filename: The name of the file where the cache is stored.
+ :return: The loaded dictionary.
+ """
+ if os.path.exists(cache_filename):
+ with open(cache_filename, 'rb') as cache_file:
+ cache_data = pickle.load(cache_file)
+ print(f"Cache loaded from {cache_filename}")
+ return cache_data
+ else:
+ print(f"No cache found at {cache_filename}")
+ return {}
+
def list_to_txt(data: list, file_name: str) -> bool:
"""
diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py
index 8a13a2f..3471f3f 100644
--- a/codecompasslib/embeddings/embeddings_helper_functions.py
+++ b/codecompasslib/embeddings/embeddings_helper_functions.py
@@ -1,3 +1,14 @@
+import sys
+import os
+
+# Construct the path to the root directory (one level up from embeddings)
+root_dir = os.path.dirname(os.path.abspath(__file__))
+project_dir = os.path.dirname(root_dir)
+real_project_dir = os.path.dirname(project_dir)
+
+# Add the project directory to the Python path
+sys.path.insert(0, real_project_dir)
+
import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
@@ -35,7 +46,7 @@ def load_word2vec_model():
Citation:
Efstathiou Vasiliki, Chatzilenas Christos, & Spinellis Diomidis. (2018). Word Embeddings for the Software Engineering Domain [Data set]. Zenodo. https://doi.org/10.5281/zenodo.1199620
"""
- word_vect = KeyedVectors.load_word2vec_format("./codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
+ word_vect = KeyedVectors.load_word2vec_format("codecompasslib/PretrainedModels/SO_vectors_200.bin", binary=True)
return word_vect
# Vectorizing text using domain specific word2vec model
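Reviewer note: both the old `./codecompasslib/...` string and the new `codecompasslib/...` string are resolved against the current working directory, so this change does not alter where the file is looked up. That likely explains the `File not found.` output in the notebook below, which runs from `codecompasslib/models/examples/`. A hypothetical, file-relative alternative (not part of this diff):

```python
# Hypothetical hardening, not the author's change: resolve the model path
# relative to this module so the loader works from any working directory.
import os
from gensim.models.keyedvectors import KeyedVectors

_MODEL_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           '..', 'PretrainedModels', 'SO_vectors_200.bin')

def load_word2vec_model():
    # Load the pretrained Stack Overflow word2vec vectors from a fixed location
    return KeyedVectors.load_word2vec_format(_MODEL_PATH, binary=True)
```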
diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py
index aaa05a7..ce8bd9a 100644
--- a/codecompasslib/embeddings/generate_embedded_dataset.py
+++ b/codecompasslib/embeddings/generate_embedded_dataset.py
@@ -90,13 +90,13 @@ def generate_openAI_embedded_csv(df, column_to_embed):
# Save the current batch DataFrame to a CSV file
# Mode 'a' appends; the header is written only for the first batch (header=not i)
- batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False)
+ batch_df.to_csv('df_embedded_1208_batch.csv', mode='a', header=not i, index=False)
# Optional: Free up memory by deleting the batch DataFrame if no longer needed
del batch_df
# Load the CSV file with the embeddings
- df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv')
+ df_with_embeddings = pd.read_csv('df_embedded_1208_batch.csv')
return df_with_embeddings
def main():
diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py
index 9dafd97..e06d71c 100644
--- a/codecompasslib/models/cosine_similarity_model.py
+++ b/codecompasslib/models/cosine_similarity_model.py
@@ -39,6 +39,7 @@ def load_data(full_data_folder_id: str) -> DataFrame:
"""
creds = get_creds_drive()
df: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+
return df
def clean_data(df: DataFrame) -> DataFrame:
diff --git a/codecompasslib/models/examples/knn_model.ipynb b/codecompasslib/models/examples/knn_model.ipynb
new file mode 100644
index 0000000..9619501
--- /dev/null
+++ b/codecompasslib/models/examples/knn_model.ipynb
@@ -0,0 +1,414 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "\n",
+ "import pandas as pd\n",
+ "from typing import Tuple\n",
+ "from pandas import DataFrame\n",
+ "import numpy as np\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "from sklearn.neighbors import NearestNeighbors\n",
+ "\n",
+ "sys.path.append('../../../')\n",
+ "from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive\n",
+ "from codecompasslib.models.model_diff_repos import load_word2vec_model, vectorize_text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Word2Vec\n",
+ "Word2Vec is a method that converts words into numerical vectors, capturing information about their meaning based on the context in which they appear.\n",
+ "\n",
+ "Here’s how it works:\n",
+ "\n",
+ "Initialize a vector for each word randomly.\n",
+ "For each word in the corpus:\n",
+ "Predict the context words (words nearby) given the target word (skip-gram).\n",
+ "Adjust the word vectors to minimize the prediction error.\n",
+ "The learned vectors represent the words’ meanings. These vectors can be used for tasks like document similarity, text classification, and information retrieval.\n",
+ "\n",
+ "In our project we use pre-trained word2vec model, specifically trained on software engineering domain."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### K-Nearest Neighbors (KNN)\n",
+ "KNN is a supervised learning algorithm primarily used for classification based on the similarity of data points. It assumes that similar things tend to be close to each other in the feature space.\n",
+ "\n",
+ "Distance Metric: To measure similarity, we compute the distance between data points. Commonly used metrics include Euclidean distance, Manhattan distance, or cosine similarity.\n",
+ "Prediction: Given a new data point, find its K nearest neighbors based on the chosen distance metric.\n",
+ "In our project we use KNN to find users most similar to our target user."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Download 11%.\n",
+ "\n",
+ "Download 23%.\n",
+ "\n",
+ "Download 35%.\n",
+ "\n",
+ "Download 47%.\n",
+ "\n",
+ "Download 59%.\n",
+ "\n",
+ "Download 71%.\n",
+ "\n",
+ "Download 83%.\n",
+ "\n",
+ "Download 95%.\n",
+ "\n",
+ "Download 100%.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\API\\drive_operations.py:88: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " return read_csv(fh)\n"
+ ]
+ }
+ ],
+ "source": [
+ "full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd'\n",
+ "\n",
+ "df_non_embedded = download_csv_as_pd_dataframe(creds=get_creds_drive(), file_id=full_data_folder_id)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "usecols=['owner_user', 'name', 'description', 'language']\n",
+ "# drop every column except for these\n",
+ "df = df_non_embedded.copy()\n",
+ "df = df[usecols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " owner_user | \n",
+ " name | \n",
+ " description | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " Rameshwar0852 | \n",
+ " Automation_Project | \n",
+ " Automated Bash Script to automate log Backup g... | \n",
+ " Shell | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Rameshwar0852 | \n",
+ " IKON | \n",
+ " CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... | \n",
+ " Python | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Rameshwar0852 | \n",
+ " javamavensonarrgohelmk8 | \n",
+ " No description | \n",
+ " HTML | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Rameshwar0852 | \n",
+ " node001 | \n",
+ " files_repo | \n",
+ " JavaScript | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " Rameshwar0852 | \n",
+ " nodeandjs | \n",
+ " node java script application | \n",
+ " JavaScript | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2583820 | \n",
+ " pinax | \n",
+ " pinax-blog | \n",
+ " a blog app for Django | \n",
+ " Python | \n",
+ "
\n",
+ " \n",
+ " 2583821 | \n",
+ " montylounge | \n",
+ " django-mingus | \n",
+ " a Django blog engine leveraging reusable apps ... | \n",
+ " JavaScript | \n",
+ "
\n",
+ " \n",
+ " 2583822 | \n",
+ " WuXianglong | \n",
+ " GeekBlog | \n",
+ " A full blog system based on Django | \n",
+ " JavaScript | \n",
+ "
\n",
+ " \n",
+ " 2583823 | \n",
+ " NARKOZ | \n",
+ " hacker-scripts | \n",
+ " Based on a true story | \n",
+ " JavaScript | \n",
+ "
\n",
+ " \n",
+ " 2583824 | \n",
+ " matthewbdaly | \n",
+ " django_tutorial_blog_ng | \n",
+ " The source for the new version of my Django tu... | \n",
+ " Python | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1524223 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " owner_user name \\\n",
+ "4 Rameshwar0852 Automation_Project \n",
+ "18 Rameshwar0852 IKON \n",
+ "19 Rameshwar0852 javamavensonarrgohelmk8 \n",
+ "23 Rameshwar0852 node001 \n",
+ "24 Rameshwar0852 nodeandjs \n",
+ "... ... ... \n",
+ "2583820 pinax pinax-blog \n",
+ "2583821 montylounge django-mingus \n",
+ "2583822 WuXianglong GeekBlog \n",
+ "2583823 NARKOZ hacker-scripts \n",
+ "2583824 matthewbdaly django_tutorial_blog_ng \n",
+ "\n",
+ " description language \n",
+ "4 Automated Bash Script to automate log Backup g... Shell \n",
+ "18 CBIR(CONTENt BASED IMAGE RETRIVALE APPLICATION... Python \n",
+ "19 No description HTML \n",
+ "23 files_repo JavaScript \n",
+ "24 node java script application JavaScript \n",
+ "... ... ... \n",
+ "2583820 a blog app for Django Python \n",
+ "2583821 a Django blog engine leveraging reusable apps ... JavaScript \n",
+ "2583822 A full blog system based on Django JavaScript \n",
+ "2583823 Based on a true story JavaScript \n",
+ "2583824 The source for the new version of my Django tu... Python \n",
+ "\n",
+ "[1524223 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dropna()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# convert language column to string type\n",
+ "df['language'] = df['language'].astype(str)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File not found.\n"
+ ]
+ },
+ {
+ "ename": "AttributeError",
+ "evalue": "'NoneType' object has no attribute 'vector_size'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[1;32mIn[54], line 39\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43membedded_user_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mname\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\series.py:4908\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4780\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4781\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4782\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4787\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4788\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4791\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4906\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4907\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 4908\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4909\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4910\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4911\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4912\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4913\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4914\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4915\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\anaconda3\\envs\\codecompassvenv\\lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n",
+ "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n",
+ "Cell \u001b[1;32mIn[54], line 39\u001b[0m, in \u001b[0;36m\u001b[1;34m(x)\u001b[0m\n\u001b[0;32m 35\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mfillna(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 38\u001b[0m word2vec_model \u001b[38;5;241m=\u001b[39m load_word2vec_model()\n\u001b[1;32m---> 39\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mvectorize_text\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword2vec_model\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 40\u001b[0m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription_vector\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m embedded_user_df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdescription\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: vectorize_text(x, word2vec_model))\n\u001b[0;32m 41\u001b[0m embedded_user_df\n",
+ "File \u001b[1;32mc:\\Users\\ketis\\UniversityStuff\\2024\\RecAndChat\\CodeCompass\\codecompasslib\\models\\examples\\../../..\\codecompasslib\\models\\model_diff_repos.py:60\u001b[0m, in \u001b[0;36mvectorize_text\u001b[1;34m(text, word_vect)\u001b[0m\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mvectorize_text\u001b[39m(text, word_vect):\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m vector_sum \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mzeros(\u001b[43mword_vect\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvector_size\u001b[49m) \u001b[38;5;66;03m# Initialize an array to store the sum of word vectors\u001b[39;00m\n\u001b[0;32m 61\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;66;03m# Initialize a count to keep track of the number of words found in the vocabulary\u001b[39;00m\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m word \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n",
+ "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'vector_size'"
+ ]
+ }
+ ],
+ "source": [
+ "# Create list of unique languages with _ prefix\n",
+ "languages = ['_' + language for language in df['language'].unique()]\n",
+ "\n",
+ "# one hot encode the languages and don't include the language prefix\n",
+ "df = pd.get_dummies(df, columns=['language'], prefix='')\n",
+ "\n",
+ "# Turn df into a repo specific df with owner_user as a unique identifier, appending description and keeping 1 if any of the languages are present in at least one repo\n",
+ "\n",
+ "# Create a dictionary for aggregation\n",
+ "aggregation_dict = {\n",
+ " 'name': lambda x: list(x),\n",
+ " 'description': lambda x: list(x)\n",
+ "}\n",
+ "\n",
+ "# Add columns for languages\n",
+ "for lang in languages:\n",
+ " aggregation_dict[lang] = 'max'\n",
+ "\n",
+ "# Group by 'owner_user' and aggregate\n",
+ "user_df = df.groupby('owner_user').agg(aggregation_dict).reset_index()\n",
+ "\n",
+ "# Display the first few rows of the resulting DataFrame\n",
+ "user_df.head()\n",
+ "\n",
+ "# first we turn list of names and descriptions into a single string\n",
+ "user_df['name'] = user_df['name'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+ "user_df['description'] = user_df['description'].apply(lambda x: ' '.join(str(i) for i in x) if isinstance(x, list) else '')\n",
+ "user_df.head()\n",
+ "\n",
+ "word_vect = load_word2vec_model\n",
+ "\n",
+ "# Text preprocessing\n",
+ "embedded_user_df = user_df.copy()\n",
+ "embedded_user_df['name'] = user_df['name'].fillna('') \n",
+ "embedded_user_df['description'] = user_df['description'].fillna('')\n",
+ "\n",
+ "\n",
+ "word2vec_model = load_word2vec_model()\n",
+ "embedded_user_df['name_vector'] = embedded_user_df['name'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+ "embedded_user_df['description_vector'] = embedded_user_df['description'].apply(lambda x: vectorize_text(x, word2vec_model))\n",
+ "embedded_user_df\n",
+ "# embedded_user_df.drop(['name', 'description', 'owner_user'], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Transform df into something that KNN can use. To be more specific, into a feature matrix\n",
+ "# Create a list of all the vectors\n",
+ "vectors = []\n",
+ "repo_df = embedded_user_df * 1 # convert all boolean values in repo_df to 0 or 1\n",
+ "\n",
+ "for row in repo_df.index: \n",
+ " vector = []\n",
+ " for columns in ['name_vector', 'description_vector']:\n",
+ " if type(repo_df.at[row, columns]) == np.ndarray:\n",
+ " for element in repo_df.at[row, columns]:\n",
+ " vector.append(element)\n",
+ " else: vector.append(repo_df.at[row, columns])\n",
+ " vectors.append(vector)\n",
+ "\n",
+ " # Train Nearest Neighbors Model\n",
+ "k = 5 # Number of neighbors to find\n",
+ "nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean')\n",
+ "nn_model.fit(vectors)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example Usage\n",
+ "\n",
+ "target_user = 21\n",
+ "# neighbors excluding the target user\n",
+ "neighbors = nn_model.kneighbors([vectors[target_user]], return_distance=False)[0][1:]\n",
+ "neighbors"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
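Reviewer note on the cell above that builds `vectors`: the loop only reads `name_vector` and `description_vector`, so the one-hot language columns aggregated earlier never reach the feature matrix; worth confirming that is intended. A vectorized sketch of the same construction, assuming (as the notebook does) that those two columns hold fixed-length numpy arrays:

```python
# Sketch under the notebook's assumptions; embedded_user_df comes from the cells above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def build_feature_matrix(frame, vector_cols=('name_vector', 'description_vector')):
    # Concatenate the per-column embedding vectors into one flat row per user
    rows = [np.concatenate([np.ravel(frame.at[i, col]) for col in vector_cols])
            for i in frame.index]
    return np.vstack(rows)

X = build_feature_matrix(embedded_user_df)
nn_model = NearestNeighbors(n_neighbors=5, metric='euclidean').fit(X)

# As in the notebook's example cell: the first hit is the query point itself
# (it is part of the training data), so [1:] keeps only the true neighbors.
neighbors = nn_model.kneighbors(X[[21]], return_distance=False)[0][1:]
```

`np.vstack` also surfaces ragged rows at construction time, which the list-of-lists version only reveals later when the model is fit.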
diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py
index 9d21174..7d64b7f 100644
--- a/codecompasslib/models/lightgbm_model.py
+++ b/codecompasslib/models/lightgbm_model.py
@@ -19,7 +19,7 @@
from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive
from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos
-
+from codecompasslib.API.helper_functions import save_cache, load_cache
def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tuple[DataFrame, ndarray]:
"""
@@ -38,7 +38,6 @@ def encode_csv(df: DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl
del df[label_col]
return df, y
-
def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]:
"""
Trains a LightGBM model using the provided merged dataframe.
@@ -128,13 +127,13 @@ def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tu
:return: The non-embedded and embedded datasets
"""
- creds = get_creds_drive()
- df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
- df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
+ # creds = get_creds_drive()
+ # df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id)
+ # df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id)
- # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data
+ # Loading the data locally is much faster than retrieving it from Drive.
+ # Uncomment the lines above to download the data from Google Drive instead.
- # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
- # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
+ df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv')
+ df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv')
print("Data loaded")
return df_non_embedded, df_embedded
@@ -166,12 +165,13 @@ def preprocess_data(df_embedded: DataFrame, df_non_embedded: DataFrame,
# Add target column: 1 if the repo is starred or owned by the user, else 0
owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]]
starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]]
+ print("Owned length: ", len(owned_by_target_repo_ids))
+ print("Starred length: ", len(starred_repo_ids))
starred_or_owned_by_user:List = starred_repo_ids + owned_by_target_repo_ids
df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0)
return df_merged, starred_or_owned_by_user
-
def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame,
df_embedded: DataFrame, number_of_recommendations: int = 10) -> list:
"""
@@ -217,5 +217,7 @@ def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFra
else:
counter += 1
recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index]))
-
+ cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+ cached_recommendations[target_user] = recommendations
+ save_cache(cached_recommendations, 'codecompasslib/recommendations_cache.pkl')
return recommendations
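Reviewer note: per the added lines, every call now performs a load-modify-save on `codecompasslib/recommendations_cache.pkl`, keyed by the target username, with values shaped like `(repo_id, owner_user, prediction_score)`. A minimal sketch of reading the cache back (the username is illustrative):

```python
# Sketch based only on the cache structure visible in this diff.
from codecompasslib.API.helper_functions import load_cache

cache = load_cache('codecompasslib/recommendations_cache.pkl')
for repo_id, owner, score in cache.get('some_github_user', []):  # key is illustrative
    print(repo_id, owner, float(score))
```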
diff --git a/codecompasslib/recommendations_cache.pkl b/codecompasslib/recommendations_cache.pkl
new file mode 100644
index 0000000..ef41e7b
Binary files /dev/null and b/codecompasslib/recommendations_cache.pkl differ
diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py
index 68b8cfc..754994f 100644
--- a/frontend/recommender/app.py
+++ b/frontend/recommender/app.py
@@ -13,6 +13,7 @@
# Import necessary functions from codecompasslib
from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data
+from codecompasslib.API.helper_functions import load_cache
# Function to load cached data
def load_cached_data():
@@ -24,6 +25,7 @@ def load_cached_data():
full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By'
st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id)
return st.session_state.cached_data
+
def main():
# Load the data
@@ -41,9 +43,13 @@ def main():
if target_user not in df_embedded['owner_user'].values:
st.error("User not found in the dataset. Please enter a valid username.")
else:
- # Generate recommendations
- with st.spinner('Generating recommendations...'):
- recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
+ cached_recommendations = load_cache('codecompasslib/recommendations_cache.pkl')
+ if target_user in cached_recommendations.keys():
+ recommendations = cached_recommendations[target_user]
+ else:
+ # Generate recommendations
+ with st.spinner('Generating recommendations...'):
+ recommendations = generate_lightGBM_recommendations(target_user, df_non_embedded, df_embedded, number_of_recommendations=10)
# Display recommendations
st.subheader("Recommendations")