From e19df465f0745aef4240f5388b5ca765137397be Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 17 Mar 2025 15:58:59 -0700 Subject: [PATCH 01/13] Added basic RAG example [skip ci] --- .gitignore | 1 + README.md | 1 + examples/rag/example.py | 65 +++++++++++++++++++++++++++++++++++ examples/rag/requirements.txt | 3 ++ 4 files changed, 70 insertions(+) create mode 100644 examples/rag/example.py create mode 100644 examples/rag/requirements.txt diff --git a/.gitignore b/.gitignore index f7ff659..c55ff44 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv/ *.pyc __pycache__ .pytest_cache/ +examples/rag/README.md diff --git a/README.md b/README.md index b6bc055..24d9bb9 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ And follow the instructions for your database library: Or check out some examples: +- [Retrieval-augmented generation](https://github.com/pgvector/pgvector-python/blob/master/examples/rag/example.py) with Ollama - [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai/example.py) with OpenAI - [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere/example.py) with Cohere - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py) with SentenceTransformers diff --git a/examples/rag/example.py b/examples/rag/example.py new file mode 100644 index 0000000..4d5d307 --- /dev/null +++ b/examples/rag/example.py @@ -0,0 +1,65 @@ +# Run: +# ollama pull llama3.2 +# ollama pull nomic-embed-text +# ollama serve + +import numpy as np +import ollama +from pathlib import Path +from pgvector.psycopg import register_vector +import psycopg +import urllib.request + +query = 'What index types are supported?' +load_data = True + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +if load_data: + # get data + url = 'https://raw.githubusercontent.com/pgvector/pgvector/refs/heads/master/README.md' + dest = Path(__file__).parent / 'README.md' + if not dest.exists(): + urllib.request.urlretrieve(url, dest) + + with open(dest, encoding='utf-8') as f: + doc = f.read() + + # generate chunks + # TODO improve chunking + # TODO remove markdown + chunks = doc.split('\n## ') + + # embed chunks + # nomic-embed-text has task instruction prefix + input = ['search_document: ' + chunk for chunk in chunks] + embeddings = ollama.embed(model='nomic-embed-text', input=input).embeddings + + # create table + conn.execute('DROP TABLE IF EXISTS chunks') + conn.execute('CREATE TABLE chunks (id bigserial PRIMARY KEY, content text, embedding vector(768))') + + # store chunks + cur = conn.cursor() + with cur.copy('COPY chunks (content, embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['text', 'vector']) + + for content, embedding in zip(chunks, embeddings): + copy.write_row([content, embedding]) + +# embed query +# nomic-embed-text has task instruction prefix +input = 'search_query: ' + query +embedding = ollama.embed(model='nomic-embed-text', input=input).embeddings[0] + +# retrieve chunks +result = conn.execute('SELECT content FROM chunks ORDER BY embedding <=> %s LIMIT 5', (np.array(embedding),)).fetchall() +context = '\n\n'.join([row[0] for row in result]) + +# get answer +# TODO improve prompt +prompt = f'Answer this question: {query}\n\n{context}' +response = ollama.generate(model='llama3.2', prompt=prompt).response +print(response) diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt new file mode 100644 index 0000000..4eb5864 --- /dev/null +++ b/examples/rag/requirements.txt @@ -0,0 +1,3 @@ +ollama +pgvector +psycopg[binary] From 1901b9cc8ab1eaf3a7415e3424509381a3399ccc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Mar 2025 01:20:18 -0700 Subject: [PATCH 02/13] Improved test [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0d8d1ca..5aec977 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -43,7 +43,7 @@ def psycopg_connect(dbapi_connection, connection_record): psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def psycopg_async_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector_async dbapi_connection.run_async(register_vector_async) @@ -51,7 +51,7 @@ def connect(dbapi_connection, connection_record): asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') @event.listens_for(asyncpg_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def asyncpg_connect(dbapi_connection, connection_record): from pgvector.asyncpg import register_vector dbapi_connection.run_async(register_vector) From eb654016181b69e9ed06871c39d8df329614cb66 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:42:07 -0700 Subject: [PATCH 03/13] Added ColBERT example for approximate search - #123 [skip ci] --- examples/colbert/approximate.py | 75 +++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 examples/colbert/approximate.py diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py new file mode 100644 index 0000000..0508d0f --- /dev/null +++ b/examples/colbert/approximate.py @@ -0,0 +1,75 @@ +# approach from section 3.6 in https://arxiv.org/abs/2004.12832 + +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('DROP TABLE IF EXISTS document_embeddings') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') +conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + with conn.transaction(): + result = conn.execute('INSERT INTO documents (content) VALUES (%s) RETURNING id', (content,)).fetchone() + params = [] + for embedding in embeddings: + params.extend([result[0], embedding.numpy()]) + values = ', '.join(['(%s, %s)' for _ in embeddings]) + conn.execute(f'INSERT INTO document_embeddings (document_id, embedding) VALUES {values}', params) + +conn.execute('CREATE INDEX ON document_embeddings (document_id)') +conn.execute('CREATE INDEX ON document_embeddings USING hnsw (embedding vector_cosine_ops)') + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] +approximate_stage = ' UNION ALL '.join(['(SELECT document_id FROM document_embeddings ORDER BY embedding <=> %s LIMIT 5)' for _ in query_embeddings]) +sql = f""" +WITH approximate_stage AS ( + {approximate_stage} +), +embeddings AS ( + SELECT document_id, array_agg(embedding) AS embeddings FROM document_embeddings + WHERE document_id IN (SELECT DISTINCT document_id FROM approximate_stage) + GROUP BY document_id +) +SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents +INNER JOIN embeddings ON embeddings.document_id = documents.id +ORDER BY max_sim DESC LIMIT 10 +""" +params = [v for v in query_embeddings] + [query_embeddings] +result = conn.execute(sql, params).fetchall() +for row in result: + print(row) From 8718cdde9f91490b39a06293ec48d8f26193334b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:51:47 -0700 Subject: [PATCH 04/13] Updated comment [skip ci] --- examples/colbert/approximate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 0508d0f..fc1d396 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -1,4 +1,4 @@ -# approach from section 3.6 in https://arxiv.org/abs/2004.12832 +# based on section 3.6 of https://arxiv.org/abs/2004.12832 from colbert.infra import ColBERTConfig from colbert.modeling.checkpoint import Checkpoint From 123f74343b03a7910b8b66de4fc33127f4696430 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:53:10 -0700 Subject: [PATCH 05/13] Improved example [skip ci] --- examples/colbert/approximate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index fc1d396..290e66d 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -12,8 +12,10 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('DROP TABLE IF EXISTS document_embeddings') + conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') + conn.execute(""" CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ WITH queries AS ( @@ -69,7 +71,7 @@ INNER JOIN embeddings ON embeddings.document_id = documents.id ORDER BY max_sim DESC LIMIT 10 """ -params = [v for v in query_embeddings] + [query_embeddings] +params = query_embeddings + [query_embeddings] result = conn.execute(sql, params).fetchall() for row in result: print(row) From bef31a81ced1517f33c5fd960e7ba10f2fd5d8e2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:02:35 -0700 Subject: [PATCH 06/13] Improved ColBERT examples [skip ci] --- examples/colbert/approximate.py | 4 ++++ examples/colbert/exact.py | 4 ++++ examples/colbert/requirements.txt | 1 + 3 files changed, 9 insertions(+) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 290e66d..623f913 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -4,6 +4,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index 1c90b47..ceed2e3 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -2,6 +2,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt index 4402ce8..54b2cb9 100644 --- a/examples/colbert/requirements.txt +++ b/examples/colbert/requirements.txt @@ -1,3 +1,4 @@ colbert-ai pgvector psycopg[binary] +transformers==4.49.0 From 208b11a893c6e5a672481847251bc13a72c84165 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:08:09 -0700 Subject: [PATCH 07/13] Improved examples[skip ci] --- examples/colbert/approximate.py | 6 +++--- examples/colbert/exact.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 623f913..14f1ce0 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -6,9 +6,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -38,6 +35,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index ceed2e3..c1ca236 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -4,9 +4,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -32,6 +29,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 6ff9b8997e75632936230829bd557281c49e1891 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:13:23 -0700 Subject: [PATCH 08/13] Updated ColBERT examples [skip ci] --- examples/colbert/approximate.py | 3 +-- examples/colbert/exact.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 14f1ce0..41f88b2 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -35,8 +35,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index c1ca236..e6a2936 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -29,8 +29,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 3f9e9a20b9f08033e7dc4e61ff4c43b34951d2ec Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Apr 2025 10:01:51 -0700 Subject: [PATCH 09/13] Updated Cohere example [skip ci] --- examples/cohere/example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cohere/example.py b/examples/cohere/example.py index 393d1e0..5ef4eec 100644 --- a/examples/cohere/example.py +++ b/examples/cohere/example.py @@ -9,12 +9,12 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))') def embed(input, input_type): - co = cohere.Client() - response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) + co = cohere.ClientV2() + response = co.embed(texts=input, model='embed-v4.0', input_type=input_type, embedding_types=['ubinary']) return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] From 713590a798190b34f4c43c4b097dbd61455113c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:09:36 -0700 Subject: [PATCH 10/13] Fixed SparseVector constructor for SciPy sparse matrices - fixes #127 --- CHANGELOG.md | 4 ++++ pgvector/sparsevec.py | 2 +- tests/test_sparse_vector.py | 14 +++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e2730..1bbd73c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.1 (unreleased) + +- Fixed `SparseVector` constructor for SciPy sparse matrices + ## 0.4.0 (2025-03-15) - Added top-level `pgvector` package diff --git a/pgvector/sparsevec.py b/pgvector/sparsevec.py index 8df2dfd..895fbd0 100644 --- a/pgvector/sparsevec.py +++ b/pgvector/sparsevec.py @@ -85,7 +85,7 @@ def _from_sparse(self, value): if hasattr(value, 'coords'): # scipy 1.13+ - self._indices = value.coords[0].tolist() + self._indices = value.coords[-1].tolist() else: self._indices = value.col.tolist() self._values = value.data.tolist() diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index dff03dd..933cfff 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array +from scipy.sparse import coo_array, csr_array, csr_matrix from struct import pack @@ -49,6 +49,18 @@ def test_dok_array(self): assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] + def test_csr_array(self): + arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_csr_matrix(self): + mat = csr_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' From 76afd8ec3013ac58bb6cc60a1b5b705f157ea18b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:15:41 -0700 Subject: [PATCH 11/13] Added test for coo_matrix --- tests/test_sparse_vector.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 933cfff..0cf0a72 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array, csr_array, csr_matrix +from scipy.sparse import coo_array, coo_matrix, csr_array, csr_matrix from struct import pack @@ -43,6 +43,12 @@ def test_coo_array_dimensions(self): SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) assert str(error.value) == 'extra argument' + def test_coo_matrix(self): + mat = coo_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_dok_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() vec = SparseVector(arr) From 809287f92847e1c609a9c395891da76f674379ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:20:20 -0700 Subject: [PATCH 12/13] Fixed CI --- tests/test_sparse_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 0cf0a72..d580f32 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -56,7 +56,7 @@ def test_dok_array(self): assert vec.indices() == [0, 2, 4] def test_csr_array(self): - arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + arr = csr_array(np.array([[1, 0, 2, 0, 3, 0]])) vec = SparseVector(arr) assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] From f9d2073df5cce39f0691ead6f9e030516baac7f8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 26 Apr 2025 11:56:00 -0700 Subject: [PATCH 13/13] Version bump to 0.4.1 [skip ci] --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbd73c..0ed80e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.1 (unreleased) +## 0.4.1 (2025-04-26) - Fixed `SparseVector` constructor for SciPy sparse matrices diff --git a/pyproject.toml b/pyproject.toml index b889f4b..9395f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.4.0" +version = "0.4.1" description = "pgvector support for Python" readme = "README.md" authors = [