diff --git a/.gitignore b/.gitignore index 58391e9..94f7230 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,8 @@ secrets/ **/__pycache__/ codecompasslib/API/datasets/**.csv codecompasslib/API/**.txt -dataset_new.csv codecompasslib/models/**.csv codecompasslib/models/examples/**.csv -codecompasslib/PretrainedModels/ \ No newline at end of file +codecompasslib/PretrainedModels/ +dump.rdb +data/* diff --git a/codecompasslib/API/drive_operations.py b/codecompasslib/API/drive_old/drive_operations.py similarity index 100% rename from codecompasslib/API/drive_operations.py rename to codecompasslib/API/drive_old/drive_operations.py diff --git a/codecompasslib/API/helper_functions.py b/codecompasslib/API/helper_functions.py index 4b04da0..178a016 100644 --- a/codecompasslib/API/helper_functions.py +++ b/codecompasslib/API/helper_functions.py @@ -15,7 +15,7 @@ def save_to_csv(data: any, filename: str) -> None: :return: Does not return anything. """ df: DataFrame = DataFrame(data) - df.to_csv(Path(PARENT_PATH + '/Data/' + filename), index=False) + df.to_csv(Path(PARENT_PATH + '/data/' + filename), index=False) def list_to_txt(data: list, file_name: str) -> bool: diff --git a/codecompasslib/API/redis_operations.py b/codecompasslib/API/redis_operations.py new file mode 100644 index 0000000..fbbcde1 --- /dev/null +++ b/codecompasslib/API/redis_operations.py @@ -0,0 +1,154 @@ +#ADD ARGUMENT HERE FOR EMBEDDED / NON EMBEDDED WHEN IMPLEMENTING REDIS FOR BOTH DATASETS +import json +import sys +import os +from redis import Redis +from pandas import DataFrame, concat, read_csv +from numpy import vstack + + +# Redis client constants +REDIS_HOST = 'localhost' +REDIS_PORT = 6379 +REDIS_DB = 0 + +#Initialize Redis client +redis_client = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True) + +def redis_to_dataframe() -> DataFrame: + """ + Retrieves embedded datasets from Redis and converts them into a DataFrame. 
+ + Returns: + pd.DataFrame: A DataFrame containing 'id' and 'embedding' columns. + """ + embedded_data = [] + + # Fetch all keys matching the pattern "embedded:*" + redis_keys = redis_client.keys('embedded:*') + + for key in redis_keys: + # Decode the key from bytes to string + key_str = key + + # Get the corresponding embedding vector + embedded_vector = redis_client.get(key_str) + + if embedded_vector: + embedding_list = json.loads(embedded_vector) # Convert from JSON string to list + repository_id = key_str.split(":")[1] # Extracting the repository ID from the key + embedded_data.append({'id': float(repository_id), 'embedding': embedding_list}) + + # Create a DataFrame from the collected embedded data + df_embed = DataFrame(embedded_data) + df_embed['id'] = df_embed['id'].astype(float) + + embedding_array = vstack(df_embed['embedding'].values) + + df_embeddings = DataFrame(embedding_array) + df_embeddings.columns = [f"embedding_{i}" for i in range(df_embeddings.shape[1])] + df_embeddings = df_embeddings.astype(float) + + df_embedded = concat([df_embed[['id']], df_embeddings], axis=1) + + return df_embedded + +def load_non_embedded_data(fname: str) -> DataFrame: + """ + Load non-embedded data from a local CSV file. + :param file_path: Path to the non-embedded CSV file. + :return: DataFrame containing non-embedded data. + """ + root_dir = os.path.dirname(os.path.abspath(__file__)) + + project_dir = os.path.dirname(root_dir) + real_project_dir = os.path.dirname(project_dir) + # Add the project directory to the Python path + sys.path.insert(0, real_project_dir) + datafolder = real_project_dir + '/data/' + + df_non_embedded = read_csv(datafolder + fname) + return df_non_embedded + + +def save_redis_to_json(file_path='redis_data.json'): + """ + Save all Redis data to a JSON file. + + Parameters: + - file_path (str): The path to the JSON file where data will be saved. 
+ """ + # Get all keys + keys = redis_client.keys('*') # Use '*' to match all keys + print(f"Number of keys: {len(keys)}") + + # Prepare a dictionary to hold all key-value pairs + data_dict = {} + + for key in keys: + print(f"KEY: {key}") + print("Data type:", redis_client.type(key)) + value = redis_client.get(key) # Adjust this function according to the Redis type, e.g., get, hgetall + + # Store in the dictionary with value handling + data_dict[key] = value + + # Write to a JSON file + with open(file_path, 'w') as json_file: + json.dump(data_dict, json_file, indent=2, ensure_ascii=False) + + print(f"Data saved to {file_path}") + +def load_json_to_redis(file_path='redis_data.json', host='localhost', port=6379, db=0): + """ + Load data from a JSON file into a Redis database. + + Parameters: + - file_path (str): The path to the JSON file to be loaded. + - host (str): The Redis server hostname. + - port (int): The Redis server port. + - db (int): The Redis database number. + """ + + # Open the JSON file and load its data + with open(file_path, 'r', encoding='utf-8') as json_file: + data_dict = json.load(json_file) + + # Iterate over each key-value pair in the loaded data and save them in Redis + for key, value in data_dict.items(): + if value is not None: + redis_client.set(key, value) + + print(f"Data loaded into Redis from {file_path}") + +def load_csv_to_redis(fname="df_embedded_combined"): + """ + Load data from a CSV file into a Redis database. + + Parameters: + - fname (str): The name of the CSV file to be loaded (assumes it ends with '.csv'). 
+ """ + path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'data', fname + '.csv') + print("Loading from:", path) + + # Read the CSV file into a pandas DataFrame + df = read_csv(path) + + # Make sure to create the embeddings_columns dynamically based on the data + embedding_columns = [col for col in df.columns if col.startswith("embedding_")] + + # Store each embedding in Redis + for index, row in df.iterrows(): + redis_key = f"embedded:{row['id']}" # Use repository ID as the Redis key + # Convert the embedding columns to a list and store as a JSON string + redis_client.set(redis_key, json.dumps(row[embedding_columns].tolist())) + if index % 10000 == 0: + print(f"Stored {index} embeddings in Redis") + + print(f"Data loaded into Redis from {fname}.csv") + +if __name__ == "__main__": + #save_redis_to_json('redis_embedded.json') + #load_json_to_redis('redis_embedded.json') + #load_csv_to_redis() + pass \ No newline at end of file diff --git a/codecompasslib/API/testing_redis.ipynb b/codecompasslib/API/testing_redis.ipynb new file mode 100644 index 0000000..45407c9 --- /dev/null +++ b/codecompasslib/API/testing_redis.ipynb @@ -0,0 +1,364 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from redis_operations import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored in: >\n", + "Traceback (most recent call last):\n", + " File \"/Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py\", line 775, in _clean_thread_parent_frames\n", + " def _clean_thread_parent_frames(\n", + "KeyboardInterrupt: \n" + ] + } + ], + "source": [ + "df = redis_to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, 
+ "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib/API/redis_operations.py:70: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df_non_embedded = read_csv(datafolder + fname)\n" + ] + } + ], + "source": [ + "non_embedded_df = load_non_embedded_data(\"data_full.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameowner_userowner_typedescriptionurlis_forkdate_createddate_updateddate_pushed...has_wikihas_pageshas _discussionsnum_forksis_archivedis_disabledis_templatelicenseopen_issuestopics
0444741024.0doc.aurora.dev-develop-compat-evmmercyogUserNo descriptionhttps://api.github.com/repos/mercyog/doc.auror...False2022-01-05T09:24:59Z2023-03-05T00:05:37Z2022-01-05T09:24:59Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
1404812692.0White_PapermercyogUserWhite Paper for Choice Coinhttps://api.github.com/repos/mercyog/White_PaperTrue2021-09-09T17:30:02Z2023-03-05T00:05:37Z2021-09-09T00:41:10Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
2462013111.0Advance-SQL-AssignmentRameshwar0852UserAdvance SQL assignment.https://api.github.com/repos/Rameshwar0852/Adv...True2022-02-21T20:01:28Z2024-03-09T14:13:51Z2020-08-05T20:54:22Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
3462013123.0anosqlRameshwar0852UserEasy SQL in Pythonhttps://api.github.com/repos/Rameshwar0852/anosqlTrue2022-02-21T20:01:29Z2024-03-09T14:13:51Z2020-09-09T18:47:09Z...TrueFalseFalse0FalseFalseFalseOther0.0[]
4585055299.0Automation_ProjectRameshwar0852UserAutomated Bash Script to automate log Backup g...https://api.github.com/repos/Rameshwar0852/Aut...False2023-01-04T07:49:35Z2024-03-09T14:13:42Z2023-01-04T17:15:46Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " id name owner_user owner_type \\\n", + "0 444741024.0 doc.aurora.dev-develop-compat-evm mercyog User \n", + "1 404812692.0 White_Paper mercyog User \n", + "2 462013111.0 Advance-SQL-Assignment Rameshwar0852 User \n", + "3 462013123.0 anosql Rameshwar0852 User \n", + "4 585055299.0 Automation_Project Rameshwar0852 User \n", + "\n", + " description \\\n", + "0 No description \n", + "1 White Paper for Choice Coin \n", + "2 Advance SQL assignment. \n", + "3 Easy SQL in Python \n", + "4 Automated Bash Script to automate log Backup g... \n", + "\n", + " url is_fork \\\n", + "0 https://api.github.com/repos/mercyog/doc.auror... False \n", + "1 https://api.github.com/repos/mercyog/White_Paper True \n", + "2 https://api.github.com/repos/Rameshwar0852/Adv... True \n", + "3 https://api.github.com/repos/Rameshwar0852/anosql True \n", + "4 https://api.github.com/repos/Rameshwar0852/Aut... False \n", + "\n", + " date_created date_updated date_pushed ... \\\n", + "0 2022-01-05T09:24:59Z 2023-03-05T00:05:37Z 2022-01-05T09:24:59Z ... \n", + "1 2021-09-09T17:30:02Z 2023-03-05T00:05:37Z 2021-09-09T00:41:10Z ... \n", + "2 2022-02-21T20:01:28Z 2024-03-09T14:13:51Z 2020-08-05T20:54:22Z ... \n", + "3 2022-02-21T20:01:29Z 2024-03-09T14:13:51Z 2020-09-09T18:47:09Z ... \n", + "4 2023-01-04T07:49:35Z 2024-03-09T14:13:42Z 2023-01-04T17:15:46Z ... 
\n", + "\n", + " has_wiki has_pages has _discussions num_forks is_archived is_disabled \\\n", + "0 True False False 0 False False \n", + "1 True False False 0 False False \n", + "2 True False False 0 False False \n", + "3 True False False 0 False False \n", + "4 True False False 0 False False \n", + "\n", + " is_template license open_issues topics \n", + "0 False No license 0.0 [] \n", + "1 False No license 0.0 [] \n", + "2 False No license 0.0 [] \n", + "3 False Other 0.0 [] \n", + "4 False No license 0.0 [] \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_embedded_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#Get list of all unique owner_user\n", + "\n", + "unique_names = non_embedded_df[\"owner_user\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['mercyog' 'Rameshwar0852' 'CodingWithHarry' 'TerakomariGandesblood'\n", + " 'lebrancconvas' 'carlosallexandre' 'Endarzboy' 'GokhanRepo' 'jonasbn'\n", + " 'WesleyRodrigues55' 'comxd' 'v1xingyue' 'normantodd' 'indrakishore'\n", + " 'nelsontang' 'Sendan4' 'k3ntar0' 'kbjude' 'friism' 'burnflare' 'pirahawk'\n", + " 'shojib' 'neodigm' 'RaffaelSchemmer' 'ashishpatel1992' 'ibocon'\n", + " 'yongfengxu' 'jrcastine' 'include' 'mfeldman143' 'crazedRomeo'\n", + " 'vivekky57' 'JPKovacs' 'philippeboyd' 'pid1' 'ITOPanda' 'kizen777'\n", + " 'EIETMC2' 'rodrich' 'Bnowako' '00-Python' 'killsnow' 'Sagiri18'\n", + " 'trantuanngoc' 'gp48maz1' 'chinahappyking' 'chenxiing' '1sagarcharaniya1'\n", + " 'keepallsimple' 'ku' 'regina-book' 'KarthickAN' 'goldenminerlmg'\n", + " 'elitongadotti' 'LeeKangHyun' 'ambitionli' 'xiedacon' 'ahahh' 'davidu'\n", + " 'csendranshi' 'JulianWe' 'Antonio24' 'aquateen' 'flom84' 'yoshikinoue'\n", + " 'k1selman' 
'vin-node' 'jderrett' 'jhhb' 'sebas1989' 'abrahamsod'\n", + " 'zirtaebn' 'LulaV14' 'CuteMing' 'teckick' 'joeldrapper' 'sifanxu1996'\n", + " 'rtfeldman' 'srinivashappy' 'nazieb' 'ymdysk' 'rurutea' 'QuteMiao'\n", + " 'EsMaybe' 'CarloDotLog' 'xhebox' 'TooSchoolForCool' 'KarthikJagadish'\n", + " 'pwnall' 'DBLESSED1' 'MoOx' 'Thirteentj' 'LegendAJJ' 'go-diego' 'wez'\n", + " 'mariotsvetanov' 'Aronfeyman' 'beerkaya' 'pros2021' 'railsbob']\n" + ] + } + ], + "source": [ + "print(unique_names[:100])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codecompasslib/embeddings/embeddings_helper_functions.py b/codecompasslib/embeddings/embeddings_helper_functions.py index 8a13a2f..abc9203 100644 --- a/codecompasslib/embeddings/embeddings_helper_functions.py +++ b/codecompasslib/embeddings/embeddings_helper_functions.py @@ -2,25 +2,25 @@ import pandas as pd from gensim.models.keyedvectors import KeyedVectors from sentence_transformers import SentenceTransformer -from langchain_community.embeddings import OllamaEmbeddings +#from langchain_community.embeddings import OllamaEmbeddings from gensim.models import KeyedVectors import openai -def add_embeddings_to_existing_dataset(df1, df2): - """ - Combines two DataFrames containing embeddings into a single DataFrame. +# def add_embeddings_to_existing_dataset(df1, df2): +# """ +# Combines two DataFrames containing embeddings into a single DataFrame. - Args: - df1 (pandas.DataFrame): The first DataFrame containing embeddings. 
- df2 (pandas.DataFrame): The second DataFrame containing embeddings. +# Args: +# df1 (pandas.DataFrame): The first DataFrame containing embeddings. +# df2 (pandas.DataFrame): The second DataFrame containing embeddings. - Returns: - pandas.DataFrame: A DataFrame containing the combined embeddings. +# Returns: +# pandas.DataFrame: A DataFrame containing the combined embeddings. - """ - df_combined = pd.concat([df1, df2], axis=0) - df_combined = df_combined.drop_duplicates(subset='id', keep='first') # Remove duplicates - return df_combined +# """ +# df_combined = pd.concat([df1, df2], axis=0) +# df_combined = df_combined.drop_duplicates(subset='id', keep='first') # Remove duplicates +# return df_combined def load_word2vec_model(): """ diff --git a/codecompasslib/embeddings/generate_embedded_dataset.py b/codecompasslib/embeddings/generate_embedded_dataset.py index aaa05a7..7927ffa 100644 --- a/codecompasslib/embeddings/generate_embedded_dataset.py +++ b/codecompasslib/embeddings/generate_embedded_dataset.py @@ -8,17 +8,28 @@ # Add the project directory to the Python path sys.path.insert(0, real_project_dir) -from codecompasslib.API.drive_operations import get_creds_drive, list_shared_drive_contents, download_csv_as_pd_dataframe, upload_df_to_drive_as_csv from codecompasslib.embeddings.embeddings_helper_functions import generate_openAI_embeddings from codecompasslib.models.secrets_manager import load_openai_key + import openai import pandas as pd +import redis +import json +import numpy as np +from redis import Redis + +# Redis client constants +REDIS_HOST = 'localhost' +REDIS_PORT = 6379 +REDIS_DB = 0 +#Initialize Redis client +redis_client = Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True) -# generate embedded dataset using OpenAI embeddings -def generate_openAI_embedded_csv(df, column_to_embed): +# Generate embedded dataset using OpenAI embeddings +def generate_openAI_embedded_to_redis(df, column_to_embed): """ - Generates embeddings for a 
given textual column in a DataFrame and saves the embeddings to a CSV file. + Generates embeddings for a given textual column in a DataFrame and saves the embeddings to Redis. Args: df (pandas.DataFrame): The DataFrame containing the data. @@ -34,94 +45,70 @@ def generate_openAI_embedded_csv(df, column_to_embed): df = pd.DataFrame({'id': [1, 2, 3], 'text': ['Hello', 'World', 'GitHub']}) df_with_embeddings = generate_openAI_embedded_csv(df, 'text') """ - # remove rows with missing values (We still have a very big dataset after removing the missing values anyway) + # Remove rows with missing values df_clean = df.dropna() - - # turn description to lowercase and remove row if description="no description" or empty string + + # Turn description to lowercase and remove rows if description="no description" or empty string df_clean = df_clean[df_clean[column_to_embed].str.lower() != 'no description'] - - # cut text if it's size exceeds 8000 tokens - df_clean[column_to_embed] = df_clean[column_to_embed].apply(lambda x: x[:8190]) # due to openAI API limit - - # grab api key from secrets + + # Cut text if its size exceeds 8000 tokens + df_clean[column_to_embed] = df_clean[column_to_embed].apply(lambda x: x[:8190]) # due to OpenAI API limit + + # Grab API key from secrets api_key = load_openai_key() client = openai.Client(api_key=api_key) - - # extract textual column as list of strings + + # Extract textual column as list of strings textual_column = df_clean[column_to_embed].values.tolist() - # extract id + # Extract IDs and owner_users ids = df_clean['id'].values.tolist() + owner_users = df_clean['owner_user'].values.tolist() - # create an emptry dataframe to store the embeddings - embedding_size = len(generate_openAI_embeddings('Test textual column', client).data[0].embedding) + # Create an empty DataFrame to store the embeddings + embedding_size = len(generate_openAI_embeddings('Test text for embedding', client).data[0].embedding) embeddings_columns = ['embedding_' + str(i) 
for i in range(embedding_size)] df_with_embeddings = pd.DataFrame(columns=['id', 'owner_user'] + embeddings_columns) - batch_size = 2040 # You can adjust this value based on the API limits and your requirements + + batch_size = 2040 # Adjust this value based on the API limits and your requirements # Iterate over every batch of textual column for i in range(0, len(textual_column), batch_size): - if i % (batch_size*10) == 0: + if i % (batch_size * 10) == 0: print(f"Processing batch starting at index: {i}") - + # Get the current batch of textual column - descriptions_batch = textual_column[i:i+batch_size] - + descriptions_batch = textual_column[i:i + batch_size] + # Get the embeddings for the current batch embeddings_response = generate_openAI_embeddings(descriptions_batch, client) - # Create a DataFrame for the current batch batch_df = pd.DataFrame(columns=['id', 'owner_user'] + embeddings_columns) - batch_df['id'] = ids[i:i+batch_size] - batch_df['owner_user'] = owner_users[i:i+batch_size] - + batch_df['id'] = ids[i:i + batch_size] + batch_df['owner_user'] = owner_users[i:i + batch_size] + # Extract the embeddings and convert them into a list of lists embeddings_list = [embedding.embedding for embedding in embeddings_response.data] # Convert the list of lists into a DataFrame embeddings_df = pd.DataFrame(embeddings_list, dtype='float16') - # Assuming 'batch_df' is your original DataFrame and you want to add the embeddings to it - # Make sure 'batch_df' has the same number of rows as 'embeddings_df' + # Assuming 'batch_df' is the original DataFrame, add the embeddings to it batch_df[embeddings_columns] = embeddings_df - - # Save the current batch DataFrame to a CSV file - # Mode 'a' is for append, header=False to avoid writing headers multiple times - batch_df.to_csv('df_embedded_0504_batch.csv', mode='a', header=not i, index=False) - - # Optional: Free up memory by deleting the batch DataFrame if no longer needed - del batch_df - - # Load the CSV file with the 
embeddings - df_with_embeddings = pd.read_csv('df_embedded_0504_batch.csv') - return df_with_embeddings - -def main(): - # Load the dataset - DRIVE_ID = "0AL1DtB4TdEWdUk9PVA" - DATA_FOLDER = "13JitBJQLNgMvFwx4QJcvrmDwKOYAShVx" - creds = get_creds_drive() - list_shared_drive_contents(creds=creds, folder_id=DATA_FOLDER, drive_id=DRIVE_ID) - - df = download_csv_as_pd_dataframe(creds,"1WSgwAhzNbSqC6e_RRBDHpgpQCnGZvVcc") - - columns_to_retrieve = ['id', 'name', 'owner_user', 'description', 'stars', 'language'] - - # retrieve only the columns that are needed - df = df[columns_to_retrieve] - - # Define the column to embed - column_to_embed = 'description' - - # Generate the embedded dataset - df_embedded = generate_openAI_embedded_csv(df, column_to_embed) - - # save the dataframe with embeddings to drive - upload_df_to_drive_as_csv(creds, df_embedded, "df_embedded_0504.csv", DATA_FOLDER) + # Store each embedding in Redis + for idx, row in batch_df.iterrows(): + # print(f"Storing embedding for ID: {row['id']} under the key: embedded:{row['id']}") + redis_key = f"embedded:{row['id']}" # Use repository ID as the Redis key + redis_client.set(redis_key, json.dumps(row[embeddings_columns].tolist())) # Store as JSON string + + # return df_with_embeddings # MAYBE DROP THE RETURN? JUST TO LOAD THE DATA INTO REDIS (MAYBE MAKE FUNCTION TO SAVE TO REDIS FROM DF??) 
+ 
-if __name__ == "__main__": - main() +#If running main script it will start generating the embeddings from the local csv +if __name__ == "__main__": + df = pd.read_csv(f"{real_project_dir}/data/data_full.csv") + generate_openAI_embedded_to_redis(df, 'description') diff --git a/codecompasslib/models/cosine_similarity_model.py b/codecompasslib/models/cosine_similarity_model.py index 9dafd97..288aa3a 100644 --- a/codecompasslib/models/cosine_similarity_model.py +++ b/codecompasslib/models/cosine_similarity_model.py @@ -25,7 +25,7 @@ # Add the project directory to the Python path sys.path.insert(0, real_project_dir) -from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive +from codecompasslib.API.drive_old.drive_operations import download_csv_as_pd_dataframe, get_creds_drive def load_data(full_data_folder_id: str) -> DataFrame: """ diff --git a/codecompasslib/models/lightgbm_model.py b/codecompasslib/models/lightgbm_model.py index 9d21174..58e2687 100644 --- a/codecompasslib/models/lightgbm_model.py +++ b/codecompasslib/models/lightgbm_model.py @@ -1,5 +1,7 @@ import os import sys +import streamlit as st +import pandas as pd # go up to root # Construct the path to the root directory (one level up from embeddings) @@ -10,6 +12,8 @@ sys.path.insert(0, real_project_dir) import pandas as pd +import json +import redis from typing import Tuple, List from pandas import DataFrame, concat from numpy import ndarray, argsort @@ -17,9 +21,8 @@ from sklearn.model_selection import train_test_split from category_encoders import ordinal -from codecompasslib.API.drive_operations import download_csv_as_pd_dataframe, get_creds_drive from codecompasslib.API.get_bulk_data import get_stared_repos, get_user_repos - +from codecompasslib.API.redis_operations import redis_to_dataframe, load_non_embedded_data def encode_csv(df: 
DataFrame, encoder, label_col: str, typ: str = "fit") -> Tupl del df[label_col] return df, y +def preprocess_data(df_embedded: DataFrame, df_non_embedded: DataFrame, + label_col: str, target_user: str) -> DataFrame: + """ + Preprocesses the data by merging embedded and non-embedded datasets, + converting the 'stars' column to integer, adding a target column, + and dropping unnecessary columns. + + Args: + df_embedded (DataFrame): The embedded dataset. + df_non_embedded (DataFrame): The non-embedded dataset. + label_col (str): The name of the target label column. + target_user (str): The username of the target user. + + Returns: + DataFrame: The preprocessed dataset. + List: List of repo IDs that are either starred or owned by the target user. + """ + # Merge the embedded and non-embedded datasets (match based on ID), grab the column you need for training + df_merged: DataFrame = pd.merge(df_embedded, df_non_embedded[['id', 'stars', 'language', 'owner_user']], on='id', how='left') + + # Turn stars column into integer column + df_merged['stars'] = df_merged['stars'].astype(int) + + # Add target column: 1 if the repo is starred or owned by the user, else 0 + owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]] + starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]] + starred_or_owned_by_user:List = starred_repo_ids + owned_by_target_repo_ids + df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0) + + return df_merged, starred_or_owned_by_user + def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Booster, ordinal.OrdinalEncoder]: """ @@ -122,77 +156,18 @@ def train_lightGBM_model(df_merged: DataFrame, label_col: str) -> Tuple[lgb.Boos return lgb_model, ord_encoder -def load_data(full_data_folder_id: str, full_data_embedded_folder_id: str) -> Tuple[DataFrame, DataFrame]: - """ - Load the data from the Google Drive - :return: The 
non-embedded and embedded datasets - """ - - creds = get_creds_drive() - df_non_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_folder_id) - df_embedded: DataFrame = download_csv_as_pd_dataframe(creds=creds, file_id=full_data_embedded_folder_id) - - # Having data locally works much faster than retrieving from drive. Uncomment the following lines to use local data - # df_non_embedded = pd.read_csv('codecompasslib/models/data_full.csv') - # df_embedded = pd.read_csv('codecompasslib/models/df_embedded_combined.csv') - - print("Data loaded") - return df_non_embedded, df_embedded - - -def preprocess_data(df_embedded: DataFrame, df_non_embedded: DataFrame, - label_col: str, target_user: str) -> DataFrame: - """ - Preprocesses the data by merging embedded and non-embedded datasets, - converting the 'stars' column to integer, adding a target column, - and dropping unnecessary columns. - - Args: - df_embedded (DataFrame): The embedded dataset. - df_non_embedded (DataFrame): The non-embedded dataset. - label_col (str): The name of the target label column. - target_user (str): The username of the target user. - - Returns: - DataFrame: The preprocessed dataset. - List: List of repo IDs that are either starred or owned by the target user. 
- """ - # Merge the embedded and non-embedded datasets (match based on ID), grab the column you need for training - df_merged: DataFrame = pd.merge(df_embedded, df_non_embedded[['id', 'stars', 'language']], on='id', how='left') - - # Turn stars column into integer column - df_merged['stars'] = df_merged['stars'].astype(int) - - # Add target column: 1 if the repo is starred or owned by the user, else 0 - owned_by_target_repo_ids: List = [item['id'] for item in get_user_repos(target_user)[0]] - starred_repo_ids: List = [item['id'] for item in get_stared_repos(target_user)[0]] - starred_or_owned_by_user:List = starred_repo_ids + owned_by_target_repo_ids - df_merged[label_col] = df_merged['id'].apply(lambda x: 1 if x in starred_or_owned_by_user else 0) - - return df_merged, starred_or_owned_by_user - - def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFrame, df_embedded: DataFrame, number_of_recommendations: int = 10) -> list: """ Generates recommendations using the LightGBM model. - - Args: - target_user (str): The target user for whom recommendations are generated. - df_non_embedded (DataFrame): The non-embedded data frame containing the features. - df_embedded (DataFrame): The embedded data frame containing the features. - label_col (str): The name of the label column. - number_of_recommendations (int, optional): The number of recommendations to generate. Defaults to 10. - - Returns: - list: A list of recommendations, each containing the repository name, owner user, and prediction score. 
""" + # Preprocess data label_col: str = 'target' df_merged, starred_or_owned_by_user = preprocess_data(df_embedded, df_non_embedded, label_col, target_user) - + df_training_ready: DataFrame = df_merged.drop(columns=['id', 'owner_user']) - + lgb_model: lgb.Booster ord_encoder: ordinal.OrdinalEncoder # Train LightGBM model @@ -219,3 +194,14 @@ def generate_lightGBM_recommendations(target_user: str, df_non_embedded: DataFra recommendations.append((df_merged.iloc[index]['id'], df_merged.iloc[index]['owner_user'], all_preds[index])) return recommendations + + + + + + + + + + + diff --git a/codecompasslib/models/redis_testing.ipynb b/codecompasslib/models/redis_testing.ipynb new file mode 100644 index 0000000..7070b9a --- /dev/null +++ b/codecompasslib/models/redis_testing.ipynb @@ -0,0 +1,1663 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Directory: /Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib/models\n", + "Project Directory: /Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib\n", + "Real Project Directory: /Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import importlib\n", + "\n", + "# Setting path\n", + "# Get the current working directory\n", + "current_dir = os.getcwd() # This is the directory where the notebook is located\n", + "project_dir = os.path.abspath(os.path.join(current_dir, '..')) # One level up from the notebook's directory\n", + "real_project_dir = os.path.abspath(os.path.join(project_dir, '..')) # Two levels up to the project directory\n", + "datafolder = real_project_dir + 'data/'\n", + "\n", + "# Add the project directory to the Python path\n", + "sys.path.insert(0, real_project_dir)\n", + "\n", + "# Optional: Check the paths\n", + 
"print(\"Current Directory:\", current_dir)\n", + "print(\"Project Directory:\", project_dir)\n", + "print(\"Real Project Directory:\", real_project_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token loaded successfully.\n" + ] + } + ], + "source": [ + "from codecompasslib.models.lightgbm_model import load_non_embedded_data, generate_lightGBM_recommendations, preprocess_data\n", + "from codecompasslib.API.redis_operations import redis_to_dataframe, load_csv_to_redis" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/maudhelenhovland/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib/API/redis_operations.py:70: DtypeWarning: Columns (6,11,12,15,16,17,18,19,20,21,22,23,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df_non_embedded = read_csv(datafolder + fname)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Non-embedded dataset \n", + "\n", + "Types: \n", + "\n", + "\n", + "RangeIndex: 2583825 entries, 0 to 2583824\n", + "Data columns (total 28 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 id float64\n", + " 1 name object \n", + " 2 owner_user object \n", + " 3 owner_type object \n", + " 4 description object \n", + " 5 url object \n", + " 6 is_fork object \n", + " 7 date_created object \n", + " 8 date_updated object \n", + " 9 date_pushed object \n", + " 10 size float64\n", + " 11 stars object \n", + " 12 watchers object \n", + " 13 updated_at object \n", + " 14 language object \n", + " 15 has_issues object \n", + " 16 has_projects object \n", + " 17 has_downloads object \n", + " 18 has_wiki object \n", + " 19 has_pages object \n", + " 20 has _discussions object \n", + " 21 num_forks 
object \n", + " 22 is_archived object \n", + " 23 is_disabled object \n", + " 24 is_template object \n", + " 25 license object \n", + " 26 open_issues float64\n", + " 27 topics object \n", + "dtypes: float64(3), object(25)\n", + "memory usage: 552.0+ MB\n", + "None\n" + ] + } + ], + "source": [ + "non_embeeded = load_non_embedded_data(\"data_full.csv\")\n", + "\n", + "print(\"\\nNon-embedded dataset \\n\\nTypes: \\n\")\n", + "print(non_embeeded.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + " Column names: \n", + "\n", + "Index(['id', 'name', 'owner_user', 'owner_type', 'description', 'url',\n", + " 'is_fork', 'date_created', 'date_updated', 'date_pushed', 'size',\n", + " 'stars', 'watchers', 'updated_at', 'language', 'has_issues',\n", + " 'has_projects', 'has_downloads', 'has_wiki', 'has_pages',\n", + " 'has _discussions', 'num_forks', 'is_archived', 'is_disabled',\n", + " 'is_template', 'license', 'open_issues', 'topics'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(\"\\n\\n Column names: \\n\")\n", + "print(non_embeeded.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameowner_userowner_typedescriptionurlis_forkdate_createddate_updateddate_pushed...has_wikihas_pageshas _discussionsnum_forksis_archivedis_disabledis_templatelicenseopen_issuestopics
0444741024.0doc.aurora.dev-develop-compat-evmmercyogUserNo descriptionhttps://api.github.com/repos/mercyog/doc.auror...False2022-01-05T09:24:59Z2023-03-05T00:05:37Z2022-01-05T09:24:59Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
1404812692.0White_PapermercyogUserWhite Paper for Choice Coinhttps://api.github.com/repos/mercyog/White_PaperTrue2021-09-09T17:30:02Z2023-03-05T00:05:37Z2021-09-09T00:41:10Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
2462013111.0Advance-SQL-AssignmentRameshwar0852UserAdvance SQL assignment.https://api.github.com/repos/Rameshwar0852/Adv...True2022-02-21T20:01:28Z2024-03-09T14:13:51Z2020-08-05T20:54:22Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
3462013123.0anosqlRameshwar0852UserEasy SQL in Pythonhttps://api.github.com/repos/Rameshwar0852/anosqlTrue2022-02-21T20:01:29Z2024-03-09T14:13:51Z2020-09-09T18:47:09Z...TrueFalseFalse0FalseFalseFalseOther0.0[]
4585055299.0Automation_ProjectRameshwar0852UserAutomated Bash Script to automate log Backup g...https://api.github.com/repos/Rameshwar0852/Aut...False2023-01-04T07:49:35Z2024-03-09T14:13:42Z2023-01-04T17:15:46Z...TrueFalseFalse0FalseFalseFalseNo license0.0[]
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " id name owner_user owner_type \\\n", + "0 444741024.0 doc.aurora.dev-develop-compat-evm mercyog User \n", + "1 404812692.0 White_Paper mercyog User \n", + "2 462013111.0 Advance-SQL-Assignment Rameshwar0852 User \n", + "3 462013123.0 anosql Rameshwar0852 User \n", + "4 585055299.0 Automation_Project Rameshwar0852 User \n", + "\n", + " description \\\n", + "0 No description \n", + "1 White Paper for Choice Coin \n", + "2 Advance SQL assignment. \n", + "3 Easy SQL in Python \n", + "4 Automated Bash Script to automate log Backup g... \n", + "\n", + " url is_fork \\\n", + "0 https://api.github.com/repos/mercyog/doc.auror... False \n", + "1 https://api.github.com/repos/mercyog/White_Paper True \n", + "2 https://api.github.com/repos/Rameshwar0852/Adv... True \n", + "3 https://api.github.com/repos/Rameshwar0852/anosql True \n", + "4 https://api.github.com/repos/Rameshwar0852/Aut... False \n", + "\n", + " date_created date_updated date_pushed ... \\\n", + "0 2022-01-05T09:24:59Z 2023-03-05T00:05:37Z 2022-01-05T09:24:59Z ... \n", + "1 2021-09-09T17:30:02Z 2023-03-05T00:05:37Z 2021-09-09T00:41:10Z ... \n", + "2 2022-02-21T20:01:28Z 2024-03-09T14:13:51Z 2020-08-05T20:54:22Z ... \n", + "3 2022-02-21T20:01:29Z 2024-03-09T14:13:51Z 2020-09-09T18:47:09Z ... \n", + "4 2023-01-04T07:49:35Z 2024-03-09T14:13:42Z 2023-01-04T17:15:46Z ... 
\n", + "\n", + " has_wiki has_pages has _discussions num_forks is_archived is_disabled \\\n", + "0 True False False 0 False False \n", + "1 True False False 0 False False \n", + "2 True False False 0 False False \n", + "3 True False False 0 False False \n", + "4 True False False 0 False False \n", + "\n", + " is_template license open_issues topics \n", + "0 False No license 0.0 [] \n", + "1 False No license 0.0 [] \n", + "2 False No license 0.0 [] \n", + "3 False Other 0.0 [] \n", + "4 False No license 0.0 [] \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_embeeded.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "embedded = redis_to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Embedded dataset \n", + "\n", + "Types: \n", + "\n", + "\n", + "RangeIndex: 1205027 entries, 0 to 1205026\n", + "Columns: 257 entries, id to embedding_255\n", + "dtypes: float64(257)\n", + "memory usage: 2.3 GB\n", + "None\n" + ] + } + ], + "source": [ + "print(\"\\nEmbedded dataset \\n\\nTypes: \\n\")\n", + "print(embedded.info())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + " Column names: \n", + "\n", + "Index(['id', 'embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',\n", + " 'embedding_4', 'embedding_5', 'embedding_6', 'embedding_7',\n", + " 'embedding_8',\n", + " ...\n", + " 'embedding_246', 'embedding_247', 'embedding_248', 'embedding_249',\n", + " 'embedding_250', 'embedding_251', 'embedding_252', 'embedding_253',\n", + " 'embedding_254', 'embedding_255'],\n", + " dtype='object', length=257)\n" + ] + } + ], + "source": [ + "print(\"\\n\\n 
Column names: \\n\")\n", + "print(embedded.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idembedding_0embedding_1embedding_2embedding_3embedding_4embedding_5embedding_6embedding_7embedding_8...embedding_246embedding_247embedding_248embedding_249embedding_250embedding_251embedding_252embedding_253embedding_254embedding_255
051728484.0-0.0729980.115051-0.0423890.032623-0.039032-0.035614-0.0392460.0781860.001712...0.0166780.153320-0.0425110.0568540.066040-0.032990-0.0358580.041168-0.0321660.133911
11845031.0-0.116882-0.038361-0.0673220.037048-0.024780-0.0470580.0129700.150635-0.132080...0.0723270.089905-0.0095060.0558780.056335-0.0229340.0070690.037811-0.0424190.092896
2640245327.0-0.0661010.001008-0.0230710.0423580.019363-0.0335080.0470280.134399-0.040833...-0.0507810.0341490.0647580.0048370.0079040.0223540.0155940.051422-0.027527-0.017197
3443559880.0-0.0060770.084595-0.0338440.069092-0.0261990.076233-0.0248720.1278080.007904...-0.0342100.1549070.162598-0.0902100.053070-0.0300140.0554500.035645-0.0364070.101624
438375571.0-0.052521-0.064087-0.019775-0.014755-0.040894-0.0651250.0214080.064697-0.084473...-0.113953-0.202881-0.024536-0.027328-0.0438840.011597-0.032959-0.010086-0.0032600.078552
\n", + "

5 rows × 257 columns

\n", + "
" + ], + "text/plain": [ + " id embedding_0 embedding_1 embedding_2 embedding_3 \\\n", + "0 51728484.0 -0.072998 0.115051 -0.042389 0.032623 \n", + "1 1845031.0 -0.116882 -0.038361 -0.067322 0.037048 \n", + "2 640245327.0 -0.066101 0.001008 -0.023071 0.042358 \n", + "3 443559880.0 -0.006077 0.084595 -0.033844 0.069092 \n", + "4 38375571.0 -0.052521 -0.064087 -0.019775 -0.014755 \n", + "\n", + " embedding_4 embedding_5 embedding_6 embedding_7 embedding_8 ... \\\n", + "0 -0.039032 -0.035614 -0.039246 0.078186 0.001712 ... \n", + "1 -0.024780 -0.047058 0.012970 0.150635 -0.132080 ... \n", + "2 0.019363 -0.033508 0.047028 0.134399 -0.040833 ... \n", + "3 -0.026199 0.076233 -0.024872 0.127808 0.007904 ... \n", + "4 -0.040894 -0.065125 0.021408 0.064697 -0.084473 ... \n", + "\n", + " embedding_246 embedding_247 embedding_248 embedding_249 embedding_250 \\\n", + "0 0.016678 0.153320 -0.042511 0.056854 0.066040 \n", + "1 0.072327 0.089905 -0.009506 0.055878 0.056335 \n", + "2 -0.050781 0.034149 0.064758 0.004837 0.007904 \n", + "3 -0.034210 0.154907 0.162598 -0.090210 0.053070 \n", + "4 -0.113953 -0.202881 -0.024536 -0.027328 -0.043884 \n", + "\n", + " embedding_251 embedding_252 embedding_253 embedding_254 embedding_255 \n", + "0 -0.032990 -0.035858 0.041168 -0.032166 0.133911 \n", + "1 -0.022934 0.007069 0.037811 -0.042419 0.092896 \n", + "2 0.022354 0.015594 0.051422 -0.027527 -0.017197 \n", + "3 -0.030014 0.055450 0.035645 -0.036407 0.101624 \n", + "4 0.011597 -0.032959 -0.010086 -0.003260 0.078552 \n", + "\n", + "[5 rows x 257 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embedded.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idembedding_0embedding_1embedding_2embedding_3embedding_4embedding_5embedding_6embedding_7embedding_8...embedding_250embedding_251embedding_252embedding_253embedding_254embedding_255starslanguageowner_usertarget
0130509551.00.0390600.02786-0.0054970.016720.020080-0.031920.1089500.11800-0.05210...-0.0068360.0112700.087800-0.009640-0.144300-0.077450Cudabarseghyanartur0
138375571.0-0.052460-0.06415-0.019880-0.01478-0.040860-0.065060.0214700.06450-0.08435...-0.0438800.011540-0.032960-0.010086-0.0032800.078600HTMLleonirlopes0
2737250560.0-0.080100-0.07544-0.036740-0.06665-0.0694000.03308-0.0158400.14070-0.02538...0.0424200.099000-0.0136800.085100-0.0125050.047208Rustlilydjwg0
3194578827.0-0.128700-0.01314-0.028080-0.110530.0310000.12470-0.0436000.18150-0.09430...-0.0421800.0271300.0578600.041350-0.0512400.023640Gozhsso0
4758946133.0-0.064900-0.08950-0.044200-0.047800.007130-0.15230-0.0486000.170400.06885...-0.040860-0.115540-0.0732000.034270-0.0215900.060402Rustjoelparkerhenderson0
..................................................................
1205022197403819.0-0.027000-0.03436-0.0249000.12225-0.0342000.02406-0.0415000.12780-0.10913...0.023060-0.001948-0.0008250.006508-0.011500-0.038851JavaScriptprzemyslawzalewski0
1205023178142966.0-0.0880000.01217-0.044160-0.070000.0263000.00918-0.0757000.01749-0.05624...-0.0300000.0259000.081800-0.042500-0.023480-0.044160PythonTobey1230
1205024479650052.0-0.039760-0.03122-0.0424000.036530.016390-0.02430-0.0014170.036220.02328...-0.025740-0.0782000.036320-0.003975-0.0112000.041960CSSHadzhieV7770
120502556554747.0-0.0912500.00616-0.036350-0.06050-0.005802-0.19030-0.0101550.07050-0.07190...0.0520000.0042800.0869000.102600-0.0124000.062870C++wyrover0
1205026139950081.00.0007720.08203-0.046360-0.01985-0.0015620.056850.0006210.12090-0.02058...0.0765400.0258000.018750-0.034270-0.0205800.133800JavaScriptthefreakingmind0
\n", + "

1205027 rows × 261 columns

\n", + "
" + ], + "text/plain": [ + " id embedding_0 embedding_1 embedding_2 embedding_3 \\\n", + "0 130509551.0 0.039060 0.02786 -0.005497 0.01672 \n", + "1 38375571.0 -0.052460 -0.06415 -0.019880 -0.01478 \n", + "2 737250560.0 -0.080100 -0.07544 -0.036740 -0.06665 \n", + "3 194578827.0 -0.128700 -0.01314 -0.028080 -0.11053 \n", + "4 758946133.0 -0.064900 -0.08950 -0.044200 -0.04780 \n", + "... ... ... ... ... ... \n", + "1205022 197403819.0 -0.027000 -0.03436 -0.024900 0.12225 \n", + "1205023 178142966.0 -0.088000 0.01217 -0.044160 -0.07000 \n", + "1205024 479650052.0 -0.039760 -0.03122 -0.042400 0.03653 \n", + "1205025 56554747.0 -0.091250 0.00616 -0.036350 -0.06050 \n", + "1205026 139950081.0 0.000772 0.08203 -0.046360 -0.01985 \n", + "\n", + " embedding_4 embedding_5 embedding_6 embedding_7 embedding_8 ... \\\n", + "0 0.020080 -0.03192 0.108950 0.11800 -0.05210 ... \n", + "1 -0.040860 -0.06506 0.021470 0.06450 -0.08435 ... \n", + "2 -0.069400 0.03308 -0.015840 0.14070 -0.02538 ... \n", + "3 0.031000 0.12470 -0.043600 0.18150 -0.09430 ... \n", + "4 0.007130 -0.15230 -0.048600 0.17040 0.06885 ... \n", + "... ... ... ... ... ... ... \n", + "1205022 -0.034200 0.02406 -0.041500 0.12780 -0.10913 ... \n", + "1205023 0.026300 0.00918 -0.075700 0.01749 -0.05624 ... \n", + "1205024 0.016390 -0.02430 -0.001417 0.03622 0.02328 ... \n", + "1205025 -0.005802 -0.19030 -0.010155 0.07050 -0.07190 ... \n", + "1205026 -0.001562 0.05685 0.000621 0.12090 -0.02058 ... \n", + "\n", + " embedding_250 embedding_251 embedding_252 embedding_253 \\\n", + "0 -0.006836 0.011270 0.087800 -0.009640 \n", + "1 -0.043880 0.011540 -0.032960 -0.010086 \n", + "2 0.042420 0.099000 -0.013680 0.085100 \n", + "3 -0.042180 0.027130 0.057860 0.041350 \n", + "4 -0.040860 -0.115540 -0.073200 0.034270 \n", + "... ... ... ... ... 
\n", + "1205022 0.023060 -0.001948 -0.000825 0.006508 \n", + "1205023 -0.030000 0.025900 0.081800 -0.042500 \n", + "1205024 -0.025740 -0.078200 0.036320 -0.003975 \n", + "1205025 0.052000 0.004280 0.086900 0.102600 \n", + "1205026 0.076540 0.025800 0.018750 -0.034270 \n", + "\n", + " embedding_254 embedding_255 stars language owner_user \\\n", + "0 -0.144300 -0.07745 0 Cuda barseghyanartur \n", + "1 -0.003280 0.07860 0 HTML leonirlopes \n", + "2 -0.012505 0.04720 8 Rust lilydjwg \n", + "3 -0.051240 0.02364 0 Go zhsso \n", + "4 -0.021590 0.06040 2 Rust joelparkerhenderson \n", + "... ... ... ... ... ... \n", + "1205022 -0.011500 -0.03885 1 JavaScript przemyslawzalewski \n", + "1205023 -0.023480 -0.04416 0 Python Tobey123 \n", + "1205024 -0.011200 0.04196 0 CSS HadzhieV777 \n", + "1205025 -0.012400 0.06287 0 C++ wyrover \n", + "1205026 -0.020580 0.13380 0 JavaScript thefreakingmind \n", + "\n", + " target \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... \n", + "1205022 0 \n", + "1205023 0 \n", + "1205024 0 \n", + "1205025 0 \n", + "1205026 0 \n", + "\n", + "[1205027 rows x 261 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_merged, _ = preprocess_data(embedded, non_embeeded, \"target\", \"mercyog\")\n", + "df_merged" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idembedding_0embedding_1embedding_2embedding_3embedding_4embedding_5embedding_6embedding_7embedding_8...embedding_250embedding_251embedding_252embedding_253embedding_254embedding_255starslanguageowner_usertarget
28415883222441.0-0.0177800.10730-0.032500.0381000.020580.06805-0.026920.13750-0.04395...0.012400.049160-0.0498400.03607-0.148200.04040250323Pythondonnemartin0
75908245717250.0-0.068050-0.03784-0.024810.0136950.01224-0.05550-0.019590.13260-0.03061...-0.05017-0.001194-0.002592-0.05612-0.101140.05222181795C++tensorflow0
8357532325298.0-0.0055540.14210-0.028600.030730-0.059420.05792-0.015700.151900.03564...-0.013030.0657000.024290-0.07260-0.044040.06207167981Ctorvalds0
282695155220641.0-0.0014910.11100-0.050400.0098500.04740-0.02014-0.025020.06366-0.01021...-0.079600.0161100.0838600.03842-0.05057-0.02344123554Pythonhuggingface0
57853921540759.0-0.076540-0.01588-0.05377-0.0147900.016170.007980.016880.02441-0.03305...0.023740.1072000.0351300.06390-0.032500.10724118663Goavelino0
\n", + "

5 rows × 261 columns

\n", + "
" + ], + "text/plain": [ + " id embedding_0 embedding_1 embedding_2 embedding_3 \\\n", + "284158 83222441.0 -0.017780 0.10730 -0.03250 0.038100 \n", + "759082 45717250.0 -0.068050 -0.03784 -0.02481 0.013695 \n", + "835753 2325298.0 -0.005554 0.14210 -0.02860 0.030730 \n", + "282695 155220641.0 -0.001491 0.11100 -0.05040 0.009850 \n", + "578539 21540759.0 -0.076540 -0.01588 -0.05377 -0.014790 \n", + "\n", + " embedding_4 embedding_5 embedding_6 embedding_7 embedding_8 ... \\\n", + "284158 0.02058 0.06805 -0.02692 0.13750 -0.04395 ... \n", + "759082 0.01224 -0.05550 -0.01959 0.13260 -0.03061 ... \n", + "835753 -0.05942 0.05792 -0.01570 0.15190 0.03564 ... \n", + "282695 0.04740 -0.02014 -0.02502 0.06366 -0.01021 ... \n", + "578539 0.01617 0.00798 0.01688 0.02441 -0.03305 ... \n", + "\n", + " embedding_250 embedding_251 embedding_252 embedding_253 \\\n", + "284158 0.01240 0.049160 -0.049840 0.03607 \n", + "759082 -0.05017 -0.001194 -0.002592 -0.05612 \n", + "835753 -0.01303 0.065700 0.024290 -0.07260 \n", + "282695 -0.07960 0.016110 0.083860 0.03842 \n", + "578539 0.02374 0.107200 0.035130 0.06390 \n", + "\n", + " embedding_254 embedding_255 stars language owner_user target \n", + "284158 -0.14820 0.04040 250323 Python donnemartin 0 \n", + "759082 -0.10114 0.05222 181795 C++ tensorflow 0 \n", + "835753 -0.04404 0.06207 167981 C torvalds 0 \n", + "282695 -0.05057 -0.02344 123554 Python huggingface 0 \n", + "578539 -0.03250 0.10724 118663 Go avelino 0 \n", + "\n", + "[5 rows x 261 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#sort df merged by most stars\n", + "df_merged = df_merged.sort_values(by='stars', ascending=False)\n", + "df_merged.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training LightGBM model\n" + ] + }, + { + "ename": "ValueError", + "evalue": "The least 
populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m recos \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate_lightGBM_recommendations\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmercyog\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnon_embeeded\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedded\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib/models/lightgbm_model.py:174\u001b[0m, in \u001b[0;36mgenerate_lightGBM_recommendations\u001b[0;34m(target_user, df_non_embedded, df_embedded, number_of_recommendations)\u001b[0m\n\u001b[1;32m 172\u001b[0m ord_encoder: ordinal\u001b[38;5;241m.\u001b[39mOrdinalEncoder\n\u001b[1;32m 173\u001b[0m \u001b[38;5;66;03m# Train LightGBM model\u001b[39;00m\n\u001b[0;32m--> 174\u001b[0m lgb_model, ord_encoder \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_lightGBM_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_training_ready\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel_col\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;66;03m# Make predictions for all repos\u001b[39;00m\n\u001b[1;32m 177\u001b[0m full_dataset_x, full_dataset_y \u001b[38;5;241m=\u001b[39m encode_csv(df_training_ready, ord_encoder, label_col, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransform\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/codecompasslib/models/lightgbm_model.py:130\u001b[0m, in 
\u001b[0;36mtrain_lightGBM_model\u001b[0;34m(df_merged, label_col)\u001b[0m\n\u001b[1;32m 127\u001b[0m y: DataFrame \u001b[38;5;241m=\u001b[39m df_merged[label_col]\n\u001b[1;32m 129\u001b[0m \u001b[38;5;66;03m# Dataset is imbalaned -> make sure that the stratify parameter is set\u001b[39;00m\n\u001b[0;32m--> 130\u001b[0m X_combined, X_test, y_combined, y_test \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m42\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstratify\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 131\u001b[0m X_train, X_val, y_train, y_val \u001b[38;5;241m=\u001b[39m train_test_split(X_combined, y_combined, test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.1\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m,\n\u001b[1;32m 132\u001b[0m stratify\u001b[38;5;241m=\u001b[39my_combined)\n\u001b[1;32m 134\u001b[0m \u001b[38;5;66;03m# combine X_train and y_train\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/sklearn/model_selection/_split.py:2583\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[0;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[1;32m 2579\u001b[0m CVClass \u001b[38;5;241m=\u001b[39m ShuffleSplit\n\u001b[1;32m 2581\u001b[0m cv \u001b[38;5;241m=\u001b[39m CVClass(test_size\u001b[38;5;241m=\u001b[39mn_test, train_size\u001b[38;5;241m=\u001b[39mn_train, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[0;32m-> 2583\u001b[0m train, test 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstratify\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2585\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(\n\u001b[1;32m 2586\u001b[0m chain\u001b[38;5;241m.\u001b[39mfrom_iterable(\n\u001b[1;32m 2587\u001b[0m (_safe_indexing(a, train), _safe_indexing(a, test)) \u001b[38;5;28;01mfor\u001b[39;00m a \u001b[38;5;129;01min\u001b[39;00m arrays\n\u001b[1;32m 2588\u001b[0m )\n\u001b[1;32m 2589\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/sklearn/model_selection/_split.py:1689\u001b[0m, in \u001b[0;36mBaseShuffleSplit.split\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Generate indices to split data into training and test set.\u001b[39;00m\n\u001b[1;32m 1660\u001b[0m \n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;124;03mto an integer.\u001b[39;00m\n\u001b[1;32m 1687\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1688\u001b[0m X, y, groups \u001b[38;5;241m=\u001b[39m indexable(X, y, groups)\n\u001b[0;32m-> 1689\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m train, test \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter_indices(X, y, groups):\n\u001b[1;32m 1690\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m train, test\n", + "File \u001b[0;32m~/Desktop/OneDrive - 
IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/sklearn/model_selection/_split.py:2078\u001b[0m, in \u001b[0;36mStratifiedShuffleSplit._iter_indices\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 2076\u001b[0m class_counts \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mbincount(y_indices)\n\u001b[1;32m 2077\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39mmin(class_counts) \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[0;32m-> 2078\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2079\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe least populated class in y has only 1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2080\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m member, which is too few. The minimum\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2081\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m number of groups for any class cannot\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2082\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m be less than 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2083\u001b[0m )\n\u001b[1;32m 2085\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m<\u001b[39m n_classes:\n\u001b[1;32m 2086\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2087\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe train_size = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m should be greater or \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2088\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mequal to the number of classes = \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m (n_train, n_classes)\n\u001b[1;32m 2089\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: The least populated class in y has only 1 member, which is 
too few. The minimum number of groups for any class cannot be less than 2." + ] + } + ], + "source": [ + "recos = generate_lightGBM_recommendations(\"mercyog\", non_embeeded, embedded)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "m e https://github.com/e/m\n" + ] + }, + { + "ename": "KeyError", + "evalue": "0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:196\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 0", + "\nThe 
above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index, repo \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(recos):\n\u001b[0;32m----> 2\u001b[0m repo_id \u001b[38;5;241m=\u001b[39m \u001b[43mrepo\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;66;03m# Assuming repo ID is the first element\u001b[39;00m\n\u001b[1;32m 3\u001b[0m owner \u001b[38;5;241m=\u001b[39m repo[\u001b[38;5;241m1\u001b[39m] \n\u001b[1;32m 4\u001b[0m link \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://github.com/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mowner\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# Link to the respective GitHub repository\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/pandas/core/frame.py:4102\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4100\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4102\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4103\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4104\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/Desktop/OneDrive - IE/Y3Q2/chatbots_recos/CodeCompass/.venv/lib/python3.10/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 0" + ] + } + ], + "source": [ + "for index, repo in enumerate(recos):\n", + " repo_id = repo[0] # Assuming repo ID is the first element\n", + " owner = repo[1] \n", + " link = f\"https://github.com/{owner}/{repo_id}\" # Link to the respective GitHub repository\n", + " print(repo_id, owner, link)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id\n", + "embedding_0\n", + "embedding_1\n", + "embedding_2\n", + "embedding_3\n", + "embedding_4\n", + "embedding_5\n", + "embedding_6\n", + "embedding_7\n", + "embedding_8\n", + "embedding_9\n", + "embedding_10\n", + "embedding_11\n", + "embedding_12\n", + "embedding_13\n", + "embedding_14\n", + "embedding_15\n", + "embedding_16\n", + "embedding_17\n", + "embedding_18\n", + "embedding_19\n", + "embedding_20\n", + "embedding_21\n", + "embedding_22\n", + "embedding_23\n", + "embedding_24\n", + "embedding_25\n", + "embedding_26\n", + "embedding_27\n", + "embedding_28\n", + "embedding_29\n", + "embedding_30\n", + "embedding_31\n", + "embedding_32\n", + "embedding_33\n", + "embedding_34\n", + "embedding_35\n", + "embedding_36\n", + "embedding_37\n", + "embedding_38\n", + "embedding_39\n", + "embedding_40\n", + "embedding_41\n", + "embedding_42\n", + "embedding_43\n", + "embedding_44\n", + "embedding_45\n", + "embedding_46\n", + "embedding_47\n", + "embedding_48\n", + "embedding_49\n", + "embedding_50\n", + "embedding_51\n", + "embedding_52\n", + "embedding_53\n", + "embedding_54\n", + "embedding_55\n", + "embedding_56\n", + "embedding_57\n", + "embedding_58\n", + "embedding_59\n", + "embedding_60\n", + "embedding_61\n", + "embedding_62\n", + 
"embedding_63\n", + "embedding_64\n", + "embedding_65\n", + "embedding_66\n", + "embedding_67\n", + "embedding_68\n", + "embedding_69\n", + "embedding_70\n", + "embedding_71\n", + "embedding_72\n", + "embedding_73\n", + "embedding_74\n", + "embedding_75\n", + "embedding_76\n", + "embedding_77\n", + "embedding_78\n", + "embedding_79\n", + "embedding_80\n", + "embedding_81\n", + "embedding_82\n", + "embedding_83\n", + "embedding_84\n", + "embedding_85\n", + "embedding_86\n", + "embedding_87\n", + "embedding_88\n", + "embedding_89\n", + "embedding_90\n", + "embedding_91\n", + "embedding_92\n", + "embedding_93\n", + "embedding_94\n", + "embedding_95\n", + "embedding_96\n", + "embedding_97\n", + "embedding_98\n", + "embedding_99\n", + "embedding_100\n", + "embedding_101\n", + "embedding_102\n", + "embedding_103\n", + "embedding_104\n", + "embedding_105\n", + "embedding_106\n", + "embedding_107\n", + "embedding_108\n", + "embedding_109\n", + "embedding_110\n", + "embedding_111\n", + "embedding_112\n", + "embedding_113\n", + "embedding_114\n", + "embedding_115\n", + "embedding_116\n", + "embedding_117\n", + "embedding_118\n", + "embedding_119\n", + "embedding_120\n", + "embedding_121\n", + "embedding_122\n", + "embedding_123\n", + "embedding_124\n", + "embedding_125\n", + "embedding_126\n", + "embedding_127\n", + "embedding_128\n", + "embedding_129\n", + "embedding_130\n", + "embedding_131\n", + "embedding_132\n", + "embedding_133\n", + "embedding_134\n", + "embedding_135\n", + "embedding_136\n", + "embedding_137\n", + "embedding_138\n", + "embedding_139\n", + "embedding_140\n", + "embedding_141\n", + "embedding_142\n", + "embedding_143\n", + "embedding_144\n", + "embedding_145\n", + "embedding_146\n", + "embedding_147\n", + "embedding_148\n", + "embedding_149\n", + "embedding_150\n", + "embedding_151\n", + "embedding_152\n", + "embedding_153\n", + "embedding_154\n", + "embedding_155\n", + "embedding_156\n", + "embedding_157\n", + "embedding_158\n", + "embedding_159\n", + 
"embedding_160\n", + "embedding_161\n", + "embedding_162\n", + "embedding_163\n", + "embedding_164\n", + "embedding_165\n", + "embedding_166\n", + "embedding_167\n", + "embedding_168\n", + "embedding_169\n", + "embedding_170\n", + "embedding_171\n", + "embedding_172\n", + "embedding_173\n", + "embedding_174\n", + "embedding_175\n", + "embedding_176\n", + "embedding_177\n", + "embedding_178\n", + "embedding_179\n", + "embedding_180\n", + "embedding_181\n", + "embedding_182\n", + "embedding_183\n", + "embedding_184\n", + "embedding_185\n", + "embedding_186\n", + "embedding_187\n", + "embedding_188\n", + "embedding_189\n", + "embedding_190\n", + "embedding_191\n", + "embedding_192\n", + "embedding_193\n", + "embedding_194\n", + "embedding_195\n", + "embedding_196\n", + "embedding_197\n", + "embedding_198\n", + "embedding_199\n", + "embedding_200\n", + "embedding_201\n", + "embedding_202\n", + "embedding_203\n", + "embedding_204\n", + "embedding_205\n", + "embedding_206\n", + "embedding_207\n", + "embedding_208\n", + "embedding_209\n", + "embedding_210\n", + "embedding_211\n", + "embedding_212\n", + "embedding_213\n", + "embedding_214\n", + "embedding_215\n", + "embedding_216\n", + "embedding_217\n", + "embedding_218\n", + "embedding_219\n", + "embedding_220\n", + "embedding_221\n", + "embedding_222\n", + "embedding_223\n", + "embedding_224\n", + "embedding_225\n", + "embedding_226\n", + "embedding_227\n", + "embedding_228\n", + "embedding_229\n", + "embedding_230\n", + "embedding_231\n", + "embedding_232\n", + "embedding_233\n", + "embedding_234\n", + "embedding_235\n", + "embedding_236\n", + "embedding_237\n", + "embedding_238\n", + "embedding_239\n", + "embedding_240\n", + "embedding_241\n", + "embedding_242\n", + "embedding_243\n", + "embedding_244\n", + "embedding_245\n", + "embedding_246\n", + "embedding_247\n", + "embedding_248\n", + "embedding_249\n", + "embedding_250\n", + "embedding_251\n", + "embedding_252\n", + "embedding_253\n", + "embedding_254\n", + 
"embedding_255\n" + ] + } + ], + "source": [ + "for element in repo:\n", + " print(element)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/frontend/recommender/app.py b/frontend/recommender/app.py index 68b8cfc..3c3d875 100644 --- a/frontend/recommender/app.py +++ b/frontend/recommender/app.py @@ -12,23 +12,31 @@ sys.path.insert(0, real_project_dir) # Import necessary functions from codecompasslib -from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations, load_data +from codecompasslib.models.lightgbm_model import generate_lightGBM_recommendations,load_non_embedded_data +from codecompasslib.API.redis_operations import redis_to_dataframe + +@st.cache_data +def load_non_embedded_data_cached(file_path): + return load_non_embedded_data(file_path) + +@st.cache_data +def redis_to_dataframe_cached(): + return redis_to_dataframe() -# Function to load cached data def load_cached_data(): - # Check if data is already stored in session state if 'cached_data' not in st.session_state: with st.spinner('Fetching data from the server...'): - # Load data - full_data_folder_id = '1Qiy9u03hUthqaoBDr4VQqhKwtLJ2O3Yd' - full_data_embedded_folder_id = '139wi78iRzhwGZwxmI5WALoYocR-Rk9By' - st.session_state.cached_data = load_data(full_data_folder_id, full_data_embedded_folder_id) + df_non_embedded = load_non_embedded_data_cached("data_full.csv") + print("\nNon embedded data loaded.") + df_embedded = redis_to_dataframe_cached() + print("\nEmbedded data from Redis loaded") 
+ st.session_state.cached_data = (df_non_embedded, df_embedded) return st.session_state.cached_data + def main(): # Load the data df_non_embedded, df_embedded = load_cached_data() - # Set app title st.title('GitHub Repo Recommendation System') @@ -38,7 +46,7 @@ def main(): # Button to get recommendations if st.button('Get Recommendations'): # Check if user exists in the dataset - if target_user not in df_embedded['owner_user'].values: + if target_user not in df_non_embedded['owner_user'].values: st.error("User not found in the dataset. Please enter a valid username.") else: # Generate recommendations diff --git a/tests/conftest.py b/tests/conftest.py index f9a8077..30d04c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,7 @@ import pytest from google.oauth2.credentials import Credentials -from codecompasslib.API.drive_operations import get_creds_drive +from codecompasslib.API.drive_old.drive_operations import get_creds_drive from pandas import DataFrame from unittest.mock import patch, Mock, mock_open import json diff --git a/tests/test_drive.py b/tests/test_drive.py index c724f03..1234447 100644 --- a/tests/test_drive.py +++ b/tests/test_drive.py @@ -1,5 +1,5 @@ import pytest -from codecompasslib.API.drive_operations import (list_shared_drive_contents, download_csv_as_pd_dataframe, +from codecompasslib.API.drive_old.drive_operations import (list_shared_drive_contents, download_csv_as_pd_dataframe, upload_df_to_drive_as_csv)