pgml-apps/pgml-chat/README.md (3 changes: 1 addition & 2 deletions)
@@ -10,10 +10,9 @@ This tool automates the above two stages and provides a command line interface t
 # Prerequisites
 Before you begin, make sure you have the following:
 
-- PostgresML Database: Spin up a for a free [GPU-powered database](https://postgresml.org/signup)
+- PostgresML Database: Sign up for a free [GPU-powered database](https://postgresml.org/signup)
-- Python version >=3.8
 - OpenAI API key
 - Python 3.8+
 
 
 # Getting started
@@ -0,0 +1 @@
pgml.sql
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # dimension of the embeddings, assuming the endpoint serves intfloat/e5-large

# create the index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

start = time()
# upsert in batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # upsert/insert these records to pinecone
    to_upsert = list(zip(ids, emb, meta))
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset  # load_dataset is only needed when pulling SQuAD from the Hub
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
dim = 1024  # dimension of the embeddings, assuming the endpoint serves intfloat/e5-large

# create the index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode with HF endpoints
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)

print("HF + Pinecone Average query time: %0.3f" % mean(run_times))



@@ -0,0 +1,21 @@
from pgml import Database
import os
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    # time embedding generation for the whole collection
    start = time()
    await collection.generate_embeddings(model_id=model_id)
    print("Time taken for PGML to generate embeddings = %0.3f" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[];
BEGIN
    LOOP
        -- RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size;
        SELECT ARRAY(SELECT chunk::text
                     FROM squad_collection_benchmark.chunks
                     WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- embed the batch; PERFORM discards the result since we only time generation
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    -- embed any records remaining in the final partial batch
    SELECT ARRAY(SELECT chunk::text
                 FROM squad_collection_benchmark.chunks
                 WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);
END;
$$;
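For ad-hoc checks, the same call can be issued from Python through `psycopg` (already pinned in requirements.txt). A minimal sketch, assuming a PostgresML database reachable via `DATABASE_URL` and the `pgml.embed(transformer, text)` SQL function:

```python
import os
import psycopg
from dotenv import load_dotenv

load_dotenv(".env")

# connect to the PostgresML database and embed one passage
with psycopg.connect(os.environ["DATABASE_URL"]) as conn:
    row = conn.execute(
        "SELECT pgml.embed('intfloat/e5-large', %s)", ("test passage",)
    ).fetchone()
    print(row[0])  # the embedding array (1024 floats for e5-large)
```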
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    documents = documents[:total_documents]
    start = time()
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
        await collection.generate_chunks()
    print("Ingesting and chunking completed in %0.3f seconds" % (time() - start))

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,38 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from statistics import mean, median

async def main():
    load_dotenv()

    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])
    model_id = await collection.register_model(model_name="intfloat/e5-large")

    run_times = []
    for query in data["context"][0:100]:
        start = time()
        results = await collection.vector_search(query, top_k=5, model_id=model_id)
        _end = time()
        run_times.append(_end - start)

    print("PGML Average query time: %0.3f" % mean(run_times))
    print("PGML Median query time: %0.3f" % median(run_times))

    # await db.archive_collection(collection_name)

if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,47 @@
aiohttp==3.8.5
aiosignal==1.3.1
async-timeout==4.0.3
attrs==23.1.0
black==23.7.0
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
datasets==2.14.4
dill==0.3.7
dnspython==2.4.2
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
huggingface-hub==0.16.4
idna==3.4
loguru==0.7.0
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.4
multiprocess==0.70.15
mypy-extensions==1.0.0
numpy==1.25.2
packaging==23.1
pandas==2.0.3
pathspec==0.11.2
pgml==0.8.1
pinecone-client==2.2.2
platformdirs==3.10.0
psycopg==3.1.10
psycopg-pool==3.1.7
pyarrow==12.0.1
Pygments==2.16.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
PyYAML==6.0.1
requests==2.31.0
rich==13.5.2
six==1.16.0
tomli==2.0.1
tqdm==4.66.1
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
xxhash==3.3.0
yarl==1.9.2