pgml-apps/pgml-chat/README.md (3 changes: 1 addition & 2 deletions)
@@ -10,10 +10,9 @@ This tool automates the above two stages and provides a command line interface t
 # Prerequisites
 Before you begin, make sure you have the following:
 
-- PostgresML Database: Spin up a for a free [GPU-powered database](https://postgresml.org/signup)
+- PostgresML Database: Sign up for a free [GPU-powered database](https://postgresml.org/signup)
-- Python version >=3.8
 - OpenAI API key
 - Python 3.8+
 
 
 # Getting started
@@ -0,0 +1 @@
pgml.sql
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # dimension of the embeddings, assuming the endpoint serves intfloat/e5-large

# create the index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

start = time()
# upsert in batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # upsert/insert these records to pinecone
    to_upsert = list(zip(ids, emb, meta))
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # dimension of the embeddings, assuming the endpoint serves intfloat/e5-large

# create the index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode with HF endpoints
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)

print("HF + Pinecone Average query time: %0.3f" % mean(run_times))



@@ -0,0 +1,21 @@
from pgml import Database
import os
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    # time embedding generation for the whole collection
    start = time()
    await collection.generate_embeddings(model_id=model_id)
    print("Time taken for PGML to generate embeddings = %0.3f" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[];
BEGIN
    LOOP
        -- RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size;
        SELECT ARRAY(SELECT chunk::text
                     FROM squad_collection_benchmark.chunks
                     WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- embed the batch; PERFORM discards the result since we only time generation
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    -- embed any records remaining in the final partial batch
    SELECT ARRAY(SELECT chunk::text
                 FROM squad_collection_benchmark.chunks
                 WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);
END;
$$;
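For ad-hoc checks, the same call can be issued from Python through `psycopg` (already pinned in requirements.txt). A minimal sketch, assuming a PostgresML database reachable via `DATABASE_URL` and the `pgml.embed(transformer, text)` SQL function:

```python
import os
import psycopg
from dotenv import load_dotenv

load_dotenv(".env")

# connect to the PostgresML database and embed one passage
with psycopg.connect(os.environ["DATABASE_URL"]) as conn:
    row = conn.execute(
        "SELECT pgml.embed('intfloat/e5-large', %s)", ("test passage",)
    ).fetchone()
    print(row[0])  # the embedding array (1024 floats for e5-large)
```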
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    documents = documents[:total_documents]
    start = time()
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
        await collection.generate_chunks()
    print("Ingesting and chunking completed in %0.3f seconds" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,38 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from statistics import mean, median

async def main():
    load_dotenv()

    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    run_times = []
    for query in data["context"][0:100]:
        start = time()
        results = await collection.vector_search(query, top_k=5, model_id=model_id)
        _end = time()
        run_times.append(_end - start)

    print("PGML Average query time: %0.3f" % mean(run_times))
    print("PGML Median query time: %0.3f" % median(run_times))

    # await db.archive_collection(collection_name)

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,47 @@
aiohttp==3.8.5
aiosignal==1.3.1
async-timeout==4.0.3
attrs==23.1.0
black==23.7.0
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
datasets==2.14.4
dill==0.3.7
dnspython==2.4.2
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
huggingface-hub==0.16.4
idna==3.4
loguru==0.7.0
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.4
multiprocess==0.70.15
mypy-extensions==1.0.0
numpy==1.25.2
packaging==23.1
pandas==2.0.3
pathspec==0.11.2
pgml==0.8.1
pinecone-client==2.2.2
platformdirs==3.10.0
psycopg==3.1.10
psycopg-pool==3.1.7
pyarrow==12.0.1
Pygments==2.16.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
PyYAML==6.0.1
requests==2.31.0
rich==13.5.2
six==1.16.0
tomli==2.0.1
tqdm==4.66.1
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
xxhash==3.3.0
yarl==1.9.2