From 78d676a88d83eab833e15a1e45d6efd3c0972cf5 Mon Sep 17 00:00:00 2001
From: patriciaflutterflow
Date: Thu, 17 Apr 2025 02:34:14 -0400
Subject: [PATCH 1/2] added logic to vectorize docs and github action

---
 .github/scripts/vectorize.py         | 184 +++++++++++++++++++++++++++
 .github/scripts/vectorize_action.py  |  16 +++
 .github/workflows/vectorize-docs.yml |  40 ++++++
 3 files changed, 240 insertions(+)
 create mode 100644 .github/scripts/vectorize.py
 create mode 100644 .github/scripts/vectorize_action.py
 create mode 100644 .github/workflows/vectorize-docs.yml

diff --git a/.github/scripts/vectorize.py b/.github/scripts/vectorize.py
new file mode 100644
index 00000000..740efe0d
--- /dev/null
+++ b/.github/scripts/vectorize.py
@@ -0,0 +1,184 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import google.generativeai as genai
from google.generativeai import embed_content
from google.cloud import bigquery
import hashlib
import time
import random
from google.api_core.exceptions import ResourceExhausted, GoogleAPIError
import os
from dotenv import load_dotenv

DOC_TABLES = {'doc_text', 'doc_text_vector'}
load_dotenv()

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
bq_client = bigquery.Client()

def chunk_markdown_file(markdown_text, chunk_size=500, chunk_overlap=50):
    # Create the text splitter with the Chroma-recommended separators
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""]
    )

    # Split the document into chunks
    chunks = text_splitter.split_text(markdown_text)

    return chunks

class Chunk:
    def __init__(self, chunk_vector, chunk_text):
        self.chunk_vector = chunk_vector
        self.chunk_text = chunk_text

def vectorize_chunk_with_retry(chunk, max_retries=5):
    model = 'models/gemini-embedding-exp-03-07'
    for attempt in range(max_retries):
        try:
            print(f"Embedding chunk (attempt {attempt + 1})...")
            response = embed_content(
                model=model,
                content=chunk,
                task_type="retrieval_document"
            )
            embedding = response["embedding"]
            return Chunk(embedding, chunk)
        except ResourceExhausted:
            # Exponential backoff with jitter before retrying on quota errors
            wait = 2 ** attempt + random.uniform(0.5, 1.5)
            print(f"🚧 Rate limit hit (429). Retrying in {wait:.2f}s...")
            time.sleep(wait)
        except GoogleAPIError as e:
            print(f"🚨 Unexpected API error: {e}")
            raise e
    raise RuntimeError("❌ Max retries exceeded for Gemini embedding.")

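# A minimal, comment-only usage sketch of the retry helper above, left
# commented out so that importing this module never calls the embedding API.
# The sample sentence is illustrative; the vector length depends on the
# (experimental) embedding model and is not assumed here.
#
#   sample = vectorize_chunk_with_retry("FlutterFlow lets you build apps visually.")
#   print(len(sample.chunk_vector))  # embedding dimensionality, model-dependent
#   print(sample.chunk_text)         # the original input text, unchanged
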
def vectorize_chunk(chunk):
    model = 'models/gemini-embedding-exp-03-07'
    response = embed_content(
        model=model,
        content=chunk,
        task_type="retrieval_document"  # use "retrieval_query" for queries
    )
    embedding = response["embedding"]
    return Chunk(embedding, chunk)

def compute_file_hash(file_path):
    content = file_path.read_bytes()
    return hashlib.sha256(content).hexdigest()

def write_to_bq_doc_text(file_hash, content):
    table = bq_client.get_table("flutterflow-io-6f20.documentation.doc_text")
    errors = bq_client.insert_rows(table, [{"doc_id": file_hash, "doc_text": content}])
    if errors:
        print(f"Encountered errors while inserting rows: {errors}")
    else:
        print("Successfully inserted rows into BigQuery")

def file_already_uploaded(table, file_hash):
    # Only whitelisted table names may be interpolated into the query below
    if table not in DOC_TABLES:
        raise ValueError(f"Invalid table: {table}")
    dataset_table = f"flutterflow-io-6f20.documentation.{table}"
    query = f"""
        SELECT COUNT(*) as count
        FROM `{dataset_table}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash),
        ]
    )
    result = bq_client.query(query, job_config=job_config).result()
    return next(result).count > 0

def batch_write_to_bq(table, rows, batch_size=10):
    for i in range(0, len(rows), batch_size):
        batch = rows[i:i+batch_size]
        errors = bq_client.insert_rows(table, batch)
        if errors:
            print(f"Errors inserting batch {i//batch_size}: {errors}")
        else:
            print(f"Successfully inserted batch {i//batch_size}")

def write_to_bq_doc_text_vector(file_hash, content, chunks):
    table_id = "flutterflow-io-6f20.documentation.doc_text_vector"

    # Step 1: Delete existing rows for the doc_id (safe since inserts below go
    # through a load job rather than the streaming buffer)
    delete_query = f"""
        DELETE FROM `{table_id}`
        WHERE doc_id = @file_hash
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("file_hash", "STRING", file_hash)
        ]
    )
    bq_client.query(delete_query, job_config=job_config).result()

    # Step 2: Generate vectorized rows, backing off on rate limits
    to_insert = []
    for i, chunk_text in enumerate(chunks):
        chunk = vectorize_chunk_with_retry(chunk_text)
        row = {
            "doc_id": file_hash,
            "chunk_id": i,
            "vectorized_chunk": chunk.chunk_vector,
            "text": chunk.chunk_text
        }
        to_insert.append(row)

    # Step 3: Batch insert via load_table_from_json
    job = bq_client.load_table_from_json(
        to_insert,
        table_id,
        job_config=bigquery.LoadJobConfig(
            write_disposition="WRITE_APPEND",  # append instead of overwrite
            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        )
    )
    job.result()
    print(f"Inserted {len(to_insert)} rows into {table_id}")


def backfill():
    current_file = Path(__file__).resolve()

    # Navigate to project root: `.github/scripts/` → project root → `docs/`
    project_root = current_file.parents[2]
    docs_dir = project_root / 'docs'
    for file_path in docs_dir.rglob("*.md"):
        file_hash = compute_file_hash(file_path)
        with open(file_path, 'r', encoding='utf-8') as file:
            markdown_text = file.read()
        if not file_already_uploaded('doc_text', file_hash):
            write_to_bq_doc_text(file_hash, markdown_text)
        chunks = chunk_markdown_file(markdown_text)
        if not file_already_uploaded('doc_text_vector', file_hash):
            write_to_bq_doc_text_vector(file_hash, markdown_text, chunks)
        else:
            print(f"Skipping {file_path} because it already exists in BigQuery")
    print("Finished vectorizing all files")

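# For reference, a sketch of the two table schemas this script assumes,
# inferred from the row dicts above rather than from an authoritative
# schema definition:
#
#   doc_text:        doc_id STRING, doc_text STRING
#   doc_text_vector: doc_id STRING, chunk_id INT64,
#                    vectorized_chunk ARRAY<FLOAT64>, text STRING
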
def process_single_file(file_path: Path):
    file_hash = compute_file_hash(file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    chunks = chunk_markdown_file(markdown_text)
    if not file_already_uploaded('doc_text', file_hash):
        write_to_bq_doc_text(file_hash, markdown_text)
    if not file_already_uploaded('doc_text_vector', file_hash):
        write_to_bq_doc_text_vector(file_hash, markdown_text, chunks)
    else:
        print(f"Skipping {file_path} because it already exists in BigQuery")

if __name__ == "__main__":
    # Guard the backfill so that importing process_single_file (as
    # vectorize_action.py does) never re-vectorizes every document
    backfill()

diff --git a/.github/scripts/vectorize_action.py b/.github/scripts/vectorize_action.py
new file mode 100644
index 00000000..1b4a2cc3
--- /dev/null
+++ b/.github/scripts/vectorize_action.py
@@ -0,0 +1,16 @@
# .github/scripts/vectorize_action.py
import sys
from pathlib import Path
from vectorize import process_single_file

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide a file path to process.")
        sys.exit(1)

    file_path = Path(sys.argv[1])
    if file_path.exists():
        process_single_file(file_path)
    else:
        print(f"File not found: {file_path}")
        sys.exit(1)

diff --git a/.github/workflows/vectorize-docs.yml b/.github/workflows/vectorize-docs.yml
new file mode 100644
index 00000000..07c63706
--- /dev/null
+++ b/.github/workflows/vectorize-docs.yml
@@ -0,0 +1,40 @@
name: Vectorize Updated Docs

on:
  push:
    branches:
      - '**'  # TODO: set to main when done testing
    paths:
      - 'flutterflow-documentation/docs/**/*.md'

jobs:
  vectorize:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          # Full history so the `git diff` between the before/after SHAs below works
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install langchain google-cloud-bigquery google-generativeai python-dotenv

      - name: Authenticate GCP
        env:
          GCP_CREDENTIALS: ${{ secrets.GCP_CREDENTIALS }}
        run: |
          echo "${GCP_CREDENTIALS}" > gcp-key.json
          # A plain `export` only lasts for this step; write the key path to
          # $GITHUB_ENV so later steps can authenticate
          echo "GOOGLE_APPLICATION_CREDENTIALS=${PWD}/gcp-key.json" >> "$GITHUB_ENV"

      - name: Vectorize modified files
        run: |
          for file in $(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep '^flutterflow-documentation/docs/.*\.md$'); do
            echo "Processing $file..."
            python .github/scripts/vectorize_action.py "$file"
          done

From 2cab146fa7aa266deb2dc038aa9bca197ec46791 Mon Sep 17 00:00:00 2001
From: patriciaflutterflow
Date: Thu, 17 Apr 2025 02:50:11 -0400
Subject: [PATCH 2/2] Add test markdown file for GitHub Action

---
 flutterflow-documentation/docs/accounts-billing/testfile.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 flutterflow-documentation/docs/accounts-billing/testfile.md

diff --git a/flutterflow-documentation/docs/accounts-billing/testfile.md b/flutterflow-documentation/docs/accounts-billing/testfile.md
new file mode 100644
index 00000000..9fb4ec04
--- /dev/null
+++ b/flutterflow-documentation/docs/accounts-billing/testfile.md
@@ -0,0 +1 @@
# Test Document for GitHub Action