From 1037d7e4c05948b6b5bfc6f8d43e0e7730224f63 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Mon, 28 Oct 2024 19:37:18 -0700
Subject: [PATCH 001/123] Added test for half-precision indexing with SQLAlchemy - #98

---
 tests/test_sqlalchemy.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py
index 1ca0ea3..8a032ef 100644
--- a/tests/test_sqlalchemy.py
+++ b/tests/test_sqlalchemy.py
@@ -46,6 +46,15 @@ class Item(Base):
 )
 index.create(engine)

+half_precision_index = Index(
+    'sqlalchemy_orm_half_precision_index',
+    func.cast(Item.embedding, HALFVEC(3)).label('embedding'),
+    postgresql_using='hnsw',
+    postgresql_with={'m': 16, 'ef_construction': 64},
+    postgresql_ops={'embedding': 'halfvec_l2_ops'}
+)
+half_precision_index.create(engine)
+

 def create_items():
     session = Session(engine)
@@ -438,6 +447,12 @@ def test_vector_array(self):
         assert item.embeddings[0].tolist() == [1, 2, 3]
         assert item.embeddings[1].tolist() == [4, 5, 6]

+    def test_half_precision(self):
+        create_items()
+        with Session(engine) as session:
+            items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all()
+            assert [v.id for v in items] == [1, 3, 2]
+
     @pytest.mark.asyncio
     @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+')
     async def test_async(self):

From 32a8d04b06b0f3e77d639e9a9ed275a67fa1e36f Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Mon, 28 Oct 2024 19:52:49 -0700
Subject: [PATCH 002/123] Added docs for half-precision indexing with SQLAlchemy - #98 [skip ci]

---
 README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/README.md b/README.md
index acd625d..bbf5973 100644
--- a/README.md
+++ b/README.md
@@ -214,6 +214,29 @@ index.create(engine)

 Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance

+#### Half-Precision Indexing
+
+Index vectors at half-precision
+
+```python
+from pgvector.sqlalchemy import HALFVEC
+from sqlalchemy.sql import func
+
+index = Index(
+    'my_index',
+    func.cast(Item.embedding, HALFVEC(3)).label('embedding'),
+    postgresql_using='hnsw',
+    postgresql_with={'m': 16, 'ef_construction': 64},
+    postgresql_ops={'embedding': 'vector_l2_ops'}
+)
+```
+
+Get the nearest neighbors
+
+```python
+session.scalars(select(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])).limit(5))
+```
+
 ## SQLModel

 Enable the extension

From 2c8fe09f824bc509ae692d2932fe1a0bc15b6923 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Mon, 28 Oct 2024 20:19:06 -0700
Subject: [PATCH 003/123] Fixed example [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bbf5973..10cca79 100644
--- a/README.md
+++ b/README.md
@@ -227,7 +227,7 @@ index = Index(
     func.cast(Item.embedding, HALFVEC(3)).label('embedding'),
     postgresql_using='hnsw',
     postgresql_with={'m': 16, 'ef_construction': 64},
-    postgresql_ops={'embedding': 'vector_l2_ops'}
+    postgresql_ops={'embedding': 'halfvec_l2_ops'}
 )
 ```

From 3d9ff72a270bbfedc2d579db7f2d03b4048dfbc0 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Mon, 28 Oct 2024 20:21:06 -0700
Subject: [PATCH 004/123] Improved example [skip ci]

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 10cca79..917d1a7 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,8 @@ index = Index(
 Get the nearest neighbors

 ```python
-session.scalars(select(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])).limit(5))
+order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])
+session.scalars(select(Item).order_by(order).limit(5))
 ```

 ## SQLModel

From 25a30264599c9646b743efdfe1d28b99d6208f90 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Sun, 3 Nov 2024 19:30:28 -0800
Subject: [PATCH 005/123] Updated test [skip ci]

---
 tests/test_django.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_django.py b/tests/test_django.py
index 5ab5f81..92f3733 100644
--- a/tests/test_django.py
+++ b/tests/test_django.py
@@ -65,7 +65,7 @@ class Meta:
                 name='hnsw_idx',
                 fields=['embedding'],
                 m=16,
-                ef_construction=100,
+                ef_construction=64,
                 opclasses=['vector_l2_ops']
             )
         ]

From 06a48c4699486b3dc2ab843104594d0fc4539038 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 09:12:53 -0800
Subject: [PATCH 006/123] Added pool example and tests for Psycopg 3 - closes #100

---
 README.md             |  9 +++++++++
 requirements.txt      |  2 +-
 tests/test_psycopg.py | 29 +++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 917d1a7..0989ba7 100644
--- a/README.md
+++ b/README.md
@@ -338,6 +338,15 @@ from pgvector.psycopg import register_vector
 register_vector(conn)
 ```

+For [connection pools](https://www.psycopg.org/psycopg3/docs/advanced/pool.html), use
+
+```python
+def configure(conn):
+    register_vector(conn)
+
+pool = ConnectionPool(configure=configure)
+```
+
 For [async connections](https://www.psycopg.org/psycopg3/docs/advanced/async.html), use

 ```python
diff --git a/requirements.txt b/requirements.txt
index c1e11f3..0e30959 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ asyncpg
 Django
 numpy
 peewee
-psycopg[binary]
+psycopg[binary,pool]
 psycopg2-binary
 pytest
 pytest-asyncio
diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py
index c4e1c22..5802b2b 100644
--- a/tests/test_psycopg.py
+++ b/tests/test_psycopg.py
@@ -1,6 +1,7 @@
 import numpy as np
 from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector, Vector
 import psycopg
+from psycopg_pool import ConnectionPool, AsyncConnectionPool
 import pytest

 conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True)
@@ -176,6 +177,18 @@ def test_vector_array(self):
         assert np.array_equal(res[0][0], embeddings[0])
         assert np.array_equal(res[0][1], embeddings[1])

+    def test_pool(self):
+        def configure(conn):
+            register_vector(conn)
+
+        pool = ConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=True, configure=configure)
+
+        with pool.connection() as conn:
+            res = conn.execute("SELECT '[1,2,3]'::vector").fetchone()
+            assert np.array_equal(res[0], np.array([1, 2, 3]))
+
+        pool.close()
+
     @pytest.mark.asyncio
     async def test_async(self):
         conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True)
@@ -195,3 +208,19 @@ async def test_async(self):
         assert np.array_equal(res[0][1], embedding)
         assert res[0][1].dtype == np.float32
         assert res[1][1] is None
+
+    @pytest.mark.asyncio
+    async def test_async_pool(self):
+        async def configure(conn):
+            await register_vector_async(conn)
+
+        pool = AsyncConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=False, configure=configure)
+        await pool.open()
+
+        async with pool.connection() as conn:
+            async with conn.cursor() as cur:
+                await cur.execute("SELECT '[1,2,3]'::vector")
+                res = await cur.fetchone()
+                assert np.array_equal(res[0], np.array([1, 2, 3]))
+
+        await pool.close()

From 49072f2e37ff97b07b422aa0a41c4d3bd312879f Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 09:33:14 -0800
Subject: [PATCH 007/123] Updated readme [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0989ba7..37a4737 100644
--- a/README.md
+++ b/README.md
@@ -344,7 +344,7 @@ For [connection pools](https://www.psycopg.org/psycopg3/docs/advanced/pool.html)
 def configure(conn):
     register_vector(conn)

-pool = ConnectionPool(configure=configure)
+pool = ConnectionPool(..., configure=configure)
 ```

 For [async connections](https://www.psycopg.org/psycopg3/docs/advanced/async.html), use

From 9e1c421c62a84f650cf8be73e0768bfab6715e7b Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 11:18:41 -0800
Subject: [PATCH 008/123] Added docs and test for half-precision indexing with Django

---
 README.md            | 25 +++++++++++++++++++++++++
 tests/test_django.py | 25 ++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 37a4737..938207a 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,31 @@ class Item(models.Model):

 Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance

+#### Half-Precision Indexing
+
+Index vectors at half-precision
+
+```python
+from django.contrib.postgres.indexes import OpClass
+from django.db.models.functions import Cast
+from pgvector.django import HalfVectorField
+
+index = HnswIndex(
+    OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'),
+    name='my_index',
+    m=16,
+    ef_construction=64
+)
+```
+
+Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass`
+
+Get the nearest neighbors
+
+```python
+Item.objects.order_by(L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]))[:5]
+```
+
 ## SQLAlchemy

 Enable the extension
diff --git a/tests/test_django.py b/tests/test_django.py
index 92f3733..353087e 100644
--- a/tests/test_django.py
+++ b/tests/test_django.py
@@ -1,6 +1,7 @@
 import django
 from django.conf import settings
 from django.contrib.postgres.fields import ArrayField
+from django.contrib.postgres.indexes import OpClass
 from django.core import serializers
 from django.db import connection, migrations, models
 from django.db.models import Avg, Sum, FloatField, DecimalField
@@ -38,7 +39,12 @@
                 'level': 'WARNING'
             }
         }
-    }
+    },
+    # needed for OpClass
+    # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/indexes/#opclass-expressions
+    INSTALLED_APPS=[
+        'django.contrib.postgres'
+    ]
 )

 django.setup()
@@ -67,6 +73,12 @@ class Meta:
                 m=16,
                 ef_construction=64,
                 opclasses=['vector_l2_ops']
+            ),
+            HnswIndex(
+                OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'),
+                name='hnsw_half_precision_idx',
+                m=16,
+                ef_construction=64
             )
         ]

@@ -99,6 +111,10 @@ class Migration(migrations.Migration):
         migrations.AddIndex(
             model_name='item',
             index=pgvector.django.HnswIndex(fields=['embedding'], m=16, ef_construction=64, name='hnsw_idx', opclasses=['vector_l2_ops']),
+        ),
+        migrations.AddIndex(
+            model_name='item',
+            index=pgvector.django.HnswIndex(OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), m=16, ef_construction=64, name='hnsw_half_precision_idx'),
         )
     ]

@@ -473,3 +489,10 @@ def test_numeric_array(self):
         assert [v.id for v in items] == [1, 3, 2]
         assert [v.distance for v in items] == [0, 1, sqrt(3)]
         assert items[1].numeric_embedding == [1, 1, 2]
+
+    def test_half_precision(self):
+        create_items()
+        distance = L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [1, 1, 1])
+        items = Item.objects.annotate(distance=distance).order_by(distance)
+        assert [v.id for v in items] == [1, 3, 2]
+        assert [v.distance for v in items] == [0, 1, sqrt(3)]

From afcd67a4422dd390e07662bfe81a8fbfab571301 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 11:23:17 -0800
Subject: [PATCH 009/123] Updated readme [skip ci]

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 938207a..1000900 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,8 @@ Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass`
 Get the nearest neighbors

 ```python
-Item.objects.order_by(L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]))[:5]
+distance = L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2])
+Item.objects.order_by(distance)[:5]
 ```

 ## SQLAlchemy

From 441b26ec3dfbdfb6013ffdf18df083614d9fc5ff Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 11:30:19 -0800
Subject: [PATCH 010/123] Updated example [skip ci]

---
 README.md | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 1000900..b61059e 100644
--- a/README.md
+++ b/README.md
@@ -142,12 +142,16 @@ from django.contrib.postgres.indexes import OpClass
 from django.db.models.functions import Cast
 from pgvector.django import HalfVectorField

-index = HnswIndex(
-    OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'),
-    name='my_index',
-    m=16,
-    ef_construction=64
-)
+class Item(models.Model):
+    class Meta:
+        indexes = [
+            HnswIndex(
+                OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'),
+                name='my_index',
+                m=16,
+                ef_construction=64
+            )
+        ]
 ```

 Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass`

From 78e64594843f2eb833cca77d8f43c33971806963 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 11:31:05 -0800
Subject: [PATCH 011/123] Updated example [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b61059e..94fed55 100644
--- a/README.md
+++ b/README.md
@@ -140,7 +140,7 @@ Index vectors at half-precision
 ```python
 from django.contrib.postgres.indexes import OpClass
 from django.db.models.functions import Cast
-from pgvector.django import HalfVectorField
+from pgvector.django import HnswIndex, HalfVectorField

 class Item(models.Model):
     class Meta:

From 75e14d80c80975938c0f1c64f59901686f1cd24b Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Tue, 5 Nov 2024 12:02:54 -0800
Subject: [PATCH 012/123] Added pool test for Psycopg 2

---
 tests/test_psycopg2.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py
index c93fce4..85f08aa 100644
--- a/tests/test_psycopg2.py
+++ b/tests/test_psycopg2.py
@@ -2,6 +2,7 @@
 from pgvector.psycopg2 import register_vector, HalfVector, SparseVector
 import psycopg2
 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor
+from psycopg2.pool import ThreadedConnectionPool

 conn = psycopg2.connect(dbname='pgvector_python_test')
 conn.autocommit = True
@@ -94,3 +95,21 @@ def test_cursor_factory_connection(self):
         conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory)
         register_vector(conn, 
globally=False) conn.close() + + def test_pool(self): + pool = ThreadedConnectionPool(1, 3, dbname='pgvector_python_test') + + conn = pool.getconn() + try: + cur = conn.cursor() + + # use globally=True for apps + register_vector(cur, globally=False) + + cur.execute("SELECT '[1,2,3]'::vector") + res = cur.fetchone() + assert np.array_equal(res[0], np.array([1, 2, 3])) + finally: + pool.putconn(conn) + + pool.closeall() From 706cebcb4c10f5fc6288757744fcfe94cb461a0b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:04:39 -0800 Subject: [PATCH 013/123] Improved test --- tests/test_psycopg2.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 85f08aa..3730eb8 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -101,11 +101,14 @@ def test_pool(self): conn = pool.getconn() try: - cur = conn.cursor() - # use globally=True for apps - register_vector(cur, globally=False) + register_vector(conn, globally=False) + finally: + pool.putconn(conn) + conn = pool.getconn() + try: + cur = conn.cursor() cur.execute("SELECT '[1,2,3]'::vector") res = cur.fetchone() assert np.array_equal(res[0], np.array([1, 2, 3])) From 812a85e7ce40d42382d84244b25e2f44eddf2e94 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:04:59 -0800 Subject: [PATCH 014/123] Improved test [skip ci] --- tests/test_psycopg2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3730eb8..3f52385 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -97,7 +97,7 @@ def test_cursor_factory_connection(self): conn.close() def test_pool(self): - pool = ThreadedConnectionPool(1, 3, dbname='pgvector_python_test') + pool = ThreadedConnectionPool(1, 1, dbname='pgvector_python_test') conn = pool.getconn() try: From 07a3b2b6eec65d332041dcec136ac9c75291bc2b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:09:23 -0800 Subject: [PATCH 015/123] Updated comment [skip ci] --- tests/test_psycopg2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3f52385..c3cd3cd 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -101,7 +101,7 @@ def test_pool(self): conn = pool.getconn() try: - # use globally=True for apps + # use globally=True for apps to ensure registered with all connections register_vector(conn, globally=False) finally: pool.putconn(conn) From ea32504ef8538c781fab1f579fcaec7b417b7163 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 11 Nov 2024 20:56:23 -0800 Subject: [PATCH 016/123] Updated pgvector on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f8bcaa3..04f1c21 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From 664b8ee8692a42236ff9b236ec2da635342b96c7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 18:28:01 -0800 Subject: [PATCH 017/123] Added test for halfvec arrays with SQLAlchemy - #101 --- tests/test_sqlalchemy.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 
8a032ef..c9aa900 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -32,6 +32,7 @@ class Item(Base): binary_embedding = mapped_column(BIT(3)) sparse_embedding = mapped_column(SPARSEVEC(3)) embeddings = mapped_column(ARRAY(VECTOR(3))) + half_embeddings = mapped_column(ARRAY(HALFVEC(3))) Base.metadata.drop_all(engine) @@ -447,6 +448,20 @@ def test_vector_array(self): assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] + def test_halfvec_array(self): + session = Session(engine) + session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() + + with engine.connect() as connection: + from pgvector.psycopg2 import register_vector + register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) + + # this fails if the driver does not cast arrays + item = Session(bind=connection).get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] + def test_half_precision(self): create_items() with Session(engine) as session: From 1c7e6a5fb3ea31512dacf71eaf4165eae9fa60e8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 19:04:21 -0800 Subject: [PATCH 018/123] Added docs for arrays with SQLAlchemy [skip ci] --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 94fed55..44f1d93 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,27 @@ order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2]) session.scalars(select(Item).order_by(order).limit(5)) ``` +#### Arrays + +Add an array column + +```python +from pgvector.sqlalchemy import Vector +from sqlalchemy import ARRAY + +class Item(Base): + embeddings = mapped_column(ARRAY(Vector(3))) +``` + +And register the types with the underlying driver + +```python +from pgvector.psycopg2 import register_vector + +with engine.connect() as connection: + register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) +``` + ## SQLModel Enable the extension From 0a760663b1acd993c7caf364c8c087c50306a01f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 20:42:27 -0800 Subject: [PATCH 019/123] Use connection from session in example and tests --- README.md | 2 +- tests/test_sqlalchemy.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 44f1d93..757ade9 100644 --- a/README.md +++ b/README.md @@ -285,7 +285,7 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector -with engine.connect() as connection: +with session.connection() as connection: register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) ``` diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index c9aa900..57cc12b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -439,12 +439,12 @@ def test_vector_array(self): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with engine.connect() as connection: + with session.connection() as connection: from pgvector.psycopg2 import register_vector register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) # this fails if the driver does not cast arrays - item = Session(bind=connection).get(Item, 1) + item = session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] @@ 
-453,12 +453,12 @@ def test_halfvec_array(self): session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with engine.connect() as connection: + with session.connection() as connection: from pgvector.psycopg2 import register_vector register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) # this fails if the driver does not cast arrays - item = Session(bind=connection).get(Item, 1) + item = session.get(Item, 1) assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] From 030def94b19329fa29c71f5273183f82c0550fd3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 20:55:15 -0800 Subject: [PATCH 020/123] Improved example and tests for arrays with SQLAlchemy - #101 [skip ci] --- README.md | 6 ++++-- tests/test_sqlalchemy.py | 39 ++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 757ade9..991c51f 100644 --- a/README.md +++ b/README.md @@ -284,9 +284,11 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector +from sqlalchemy import engine -with session.connection() as connection: - register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) +@event.listens_for(engine, "connect") +def connect(dbapi_connection, connection_record): + register_vector(dbapi_connection, arrays=True) ``` ## SQLModel diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 57cc12b..f8e4bb1 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest -from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY +from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError from sqlalchemy.ext.automap import automap_base from sqlalchemy.orm import declarative_base, Session @@ -20,6 +20,15 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() +array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') + + +@event.listens_for(array_engine, "connect") +def connect(dbapi_connection, connection_record): + from pgvector.psycopg2 import register_vector + register_vector(dbapi_connection, globally=False, arrays=True) + + Base = declarative_base() @@ -435,32 +444,24 @@ def test_automap(self): assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): - session = Session(engine) + session = Session(array_engine) session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with session.connection() as connection: - from pgvector.psycopg2 import register_vector - register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) - - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self): - session = Session(engine) + session = Session(array_engine) session.add(Item(id=1, 
half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with session.connection() as connection: - from pgvector.psycopg2 import register_vector - register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) - - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] def test_half_precision(self): create_items() From d23844ef10dcd4297a9e2f3671ed8e851e0a2db1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 21:00:22 -0800 Subject: [PATCH 021/123] Fixed example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 991c51f..516f3aa 100644 --- a/README.md +++ b/README.md @@ -284,7 +284,7 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector -from sqlalchemy import engine +from sqlalchemy import event @event.listens_for(engine, "connect") def connect(dbapi_connection, connection_record): From 04aa5bca2ee60c73de91507e5eb7472a6cf6d7a6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 22:39:46 -0800 Subject: [PATCH 022/123] Added test for arrays with SQLAlchemy async - #101 --- tests/test_sqlalchemy.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f8e4bb1..77c03fc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -483,3 +483,25 @@ async def test_async(self): assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_async_vector_array(self): + engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + @event.listens_for(engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) + + async with async_session() as session: + async with session.begin(): + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays + item = await session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + + await engine.dispose() From dbc44f4533e9edaa376dc4d4a18fea235c5e2187 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 22:48:46 -0800 Subject: [PATCH 023/123] Added more examples for arrays with SQLAlchemy [skip ci] --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 516f3aa..0dedce9 100644 --- a/README.md +++ b/README.md @@ -282,6 +282,30 @@ class Item(Base): And register the types with the underlying driver +For Psycopg 3, use + +```python +from pgvector.psycopg import register_vector +from sqlalchemy import event + +@event.listens_for(engine, "connect") +def connect(dbapi_connection, connection_record): + register_vector(dbapi_connection) +``` + +For [async connections](https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html) with 
Psycopg 3, use + +```python +from pgvector.psycopg import register_vector_async +from sqlalchemy import event + +@event.listens_for(engine.sync_engine, "connect") +def connect(dbapi_connection, connection_record): + dbapi_connection.run_async(register_vector_async) +``` + +For Psycopg 2, use + ```python from pgvector.psycopg2 import register_vector from sqlalchemy import event From 368b363bbf9a48fe42bc114991c4e97ee140cdeb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:09:06 -0800 Subject: [PATCH 024/123] Added ColBERT example for binary embeddings [skip ci] --- examples/colbert/exact_binary.py | 53 ++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 examples/colbert/exact_binary.py diff --git a/examples/colbert/exact_binary.py b/examples/colbert/exact_binary.py new file mode 100644 index 0000000..8d398e2 --- /dev/null +++ b/examples/colbert/exact_binary.py @@ -0,0 +1,53 @@ +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector, Bit +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings bit(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + + +def binary_quantize(embeddings): + return [Bit(e.numpy()) for e in (embeddings > 0)] + + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + embeddings = binary_quantize(embeddings) + conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) + +query = 'puppy' +query_embeddings = binary_quantize(checkpoint.queryFromText([query])[0]) +result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) From 267d7960156b6866c300229a10b79b89d670ea39 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:41:25 -0800 Subject: [PATCH 025/123] Added ColPali / ColQwen2 example [skip ci] --- README.md | 1 + examples/colpali/exact.py | 52 +++++++++++++++++++++++++++++++ examples/colpali/requirements.txt | 4 +++ 3 files changed, 57 insertions(+) create mode 100644 examples/colpali/exact.py create mode 100644 examples/colpali/requirements.txt diff --git a/README.md b/README.md index 0dedce9..224fe57 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Or check out some examples: - [Hybrid 
search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers - [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT +- [Document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing - [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py new file mode 100644 index 0000000..408bc7f --- /dev/null +++ b/examples/colpali/exact.py @@ -0,0 +1,52 @@ +from colpali_engine.models import ColQwen2, ColQwen2Processor +from datasets import load_dataset +from pgvector.psycopg import register_vector +import psycopg +import torch + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings vector(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + + +device = 'mps' if torch.backends.mps.is_available() else 'cpu' +model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() +processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') + + +def generate_embeddings(processed): + with torch.no_grad(): + return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) + + +input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] +for content in input: + embeddings = [e.numpy() for e in generate_embeddings(processor.process_images([content]))[0]] + conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) + +query = 'dividend' +query_embeddings = [e.numpy() for e in generate_embeddings(processor.process_queries([query]))[0]] +result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) diff --git a/examples/colpali/requirements.txt b/examples/colpali/requirements.txt new file mode 100644 index 0000000..4cf770d --- /dev/null +++ b/examples/colpali/requirements.txt @@ -0,0 +1,4 @@ +colpali-engine +datasets +pgvector +psycopg[binary] From 7d8a4173d988b5e9debaba7b4d6320d61879e76c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:47:10 -0800 Subject: [PATCH 026/123] Updated ColPali example to use binary 
quantization [skip ci] --- examples/colbert/exact_binary.py | 53 -------------------------------- examples/colpali/exact.py | 16 ++++++---- 2 files changed, 10 insertions(+), 59 deletions(-) delete mode 100644 examples/colbert/exact_binary.py diff --git a/examples/colbert/exact_binary.py b/examples/colbert/exact_binary.py deleted file mode 100644 index 8d398e2..0000000 --- a/examples/colbert/exact_binary.py +++ /dev/null @@ -1,53 +0,0 @@ -from colbert.infra import ColBERTConfig -from colbert.modeling.checkpoint import Checkpoint -from pgvector.psycopg import register_vector, Bit -import psycopg - -conn = psycopg.connect(dbname='pgvector_example', autocommit=True) - -conn.execute('CREATE EXTENSION IF NOT EXISTS vector') -register_vector(conn) - -conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings bit(128)[])') -conn.execute(""" -CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ - WITH queries AS ( - SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) - ), - documents AS ( - SELECT unnest(document) AS document - ), - similarities AS ( - SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents - ), - max_similarities AS ( - SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number - ) - SELECT SUM(max_similarity) FROM max_similarities -$$ LANGUAGE SQL -""") - - -def binary_quantize(embeddings): - return [Bit(e.numpy()) for e in (embeddings > 0)] - - -config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) -checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) - -input = [ - 'The dog is barking', - 'The cat is purring', - 'The bear is growling' -] -doc_embeddings = checkpoint.docFromText(input, keep_dims=False) -for content, embeddings in zip(input, doc_embeddings): - embeddings = binary_quantize(embeddings) - conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) - -query = 'puppy' -query_embeddings = binary_quantize(checkpoint.queryFromText([query])[0]) -result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() -for row in result: - print(row) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 408bc7f..9fffc5f 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -1,6 +1,6 @@ from colpali_engine.models import ColQwen2, ColQwen2Processor from datasets import load_dataset -from pgvector.psycopg import register_vector +from pgvector.psycopg import register_vector, Bit import psycopg import torch @@ -10,9 +10,9 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings vector(128)[])') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings bit(128)[])') conn.execute(""" -CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ +CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ WITH queries AS ( SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) ), @@ -20,7 +20,7 @@ SELECT unnest(document) AS document ), similarities AS ( - SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + 
SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents ), max_similarities AS ( SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number @@ -40,13 +40,17 @@ def generate_embeddings(processed): return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) +def binary_quantize(embedding): + return Bit(embedding > 0) + + input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] for content in input: - embeddings = [e.numpy() for e in generate_embeddings(processor.process_images([content]))[0]] + embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_images([content]))[0]] conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) query = 'dividend' -query_embeddings = [e.numpy() for e in generate_embeddings(processor.process_queries([query]))[0]] +query_embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_queries([query]))[0]] result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() for row in result: print(row) From d73a412de5fcb6d225b4d90865f0c4e514a142d3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:54:52 -0800 Subject: [PATCH 027/123] Updated ColPali example to use get_torch_device [skip ci] --- examples/colpali/exact.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 9fffc5f..6eac7a4 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -1,4 +1,5 @@ from colpali_engine.models import ColQwen2, ColQwen2Processor +from colpali_engine.utils.torch_utils import get_torch_device from datasets import load_dataset from pgvector.psycopg import register_vector, Bit import psycopg @@ -30,7 +31,7 @@ """) -device = 'mps' if torch.backends.mps.is_available() else 'cpu' +device = get_torch_device('auto') model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') From 7b6a46a014144f05ba174a53510ed69fd113b100 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:55:26 -0800 Subject: [PATCH 028/123] Removed extra line [skip ci] --- examples/colpali/exact.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 6eac7a4..06d1828 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -30,7 +30,6 @@ $$ LANGUAGE SQL """) - device = get_torch_device('auto') model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') From 4998aa1daefe95eb7550bc92ed875d6193d73b57 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:19:48 -0800 Subject: [PATCH 029/123] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 224fe57..260d389 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Or check out some examples: - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with 
Transformers - [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT -- [Document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali +- [Visual document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing - [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit From af7b04f271612c3f663e9a508f9c44564272e3a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:40:44 -0800 Subject: [PATCH 030/123] Updated example [skip ci] --- examples/colpali/exact.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 06d1828..c6f1467 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -37,20 +37,20 @@ def generate_embeddings(processed): with torch.no_grad(): - return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) + return model(**processed.to(model.device)) def binary_quantize(embedding): - return Bit(embedding > 0) + return Bit(embedding.gt(0).numpy(force=True)) input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] for content in input: - embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_images([content]))[0]] + embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_images([content]))[0]] conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) query = 'dividend' -query_embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_queries([query]))[0]] +query_embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_queries([query]))[0]] result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() for row in result: print(row) From ca637bff37674592f08b9f65c75249b0d709746e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:44:27 -0800 Subject: [PATCH 031/123] Updated example [skip ci] --- examples/colpali/exact.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index c6f1467..80bb603 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -37,11 +37,11 @@ def generate_embeddings(processed): with torch.no_grad(): - return model(**processed.to(model.device)) + return model(**processed.to(model.device)).to(torch.float32).numpy(force=True) def binary_quantize(embedding): - return Bit(embedding.gt(0).numpy(force=True)) + return Bit(embedding > 0) input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] From 5c35a5399aa3101f35e09e941a4e7cce0218e1ef Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 30 Nov 2024 03:30:34 -0800 Subject: [PATCH 032/123] Added test for binary quantization with SQLAlchemy - #98 [skip ci] --- tests/test_sqlalchemy.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 77c03fc..0380c89 100644 --- 
a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -65,6 +65,14 @@ class Item(Base): ) half_precision_index.create(engine) +binary_quantize_index = Index( + 'sqlalchemy_orm_binary_quantize_index', + func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'bit_hamming_ops'} +) +binary_quantize_index.create(engine) def create_items(): session = Session(engine) @@ -469,6 +477,18 @@ def test_half_precision(self): items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] + def test_binary_quantize(self): + session = Session(engine) + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + + with Session(engine) as session: + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + items = session.query(Item).order_by(distance).all() + assert [v.id for v in items] == [2, 3, 1] + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_async(self): From 230fe853d58105df1951fbbbc1730469b341f056 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 19 Dec 2024 11:59:52 -0800 Subject: [PATCH 033/123] Fixed spacing [skip ci] --- tests/test_sqlalchemy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0380c89..9ab706a 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -74,6 +74,7 @@ class Item(Base): ) binary_quantize_index.create(engine) + def create_items(): session = Session(engine) session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) From 057806e44f6943230699d99c742621baeb9023c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 19 Dec 2024 12:00:52 -0800 Subject: [PATCH 034/123] Added test for bit type with SQLAlchemy and asyncpg - #110 --- tests/test_sqlalchemy.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9ab706a..0b53252 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -526,3 +526,20 @@ def connect(dbapi_connection, connection_record): assert item.embeddings[1].tolist() == [4, 5, 6] await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_bit(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = asyncpg.BitString('101') + session.add(Item(id=1, binary_embedding=embedding)) + item = await session.get(Item, 1) + assert item.binary_embedding == embedding + + await engine.dispose() From 57b6a61149c1f009ae55cccc17a9b5900e335f72 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 19 Dec 2024 12:06:02 -0800 Subject: [PATCH 035/123] Improved asyncpg test [skip ci] --- tests/test_asyncpg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 7a68a9e..48d1e32 100644 --- 
a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -59,10 +59,11 @@ async def test_bit(self): await register_vector(conn) - embedding = asyncpg.BitString.from_int(5, length=3) + embedding = asyncpg.BitString('101') await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert res[0]['embedding'].as_string() == '101' assert res[0]['embedding'].to_int() == 5 assert res[1]['embedding'] is None From 47ad76d88f72cf07ffa238e4ad2714b672346149 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:35:43 -0800 Subject: [PATCH 036/123] Improved SQLModel example --- README.md | 3 +-- tests/test_sqlmodel.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 260d389..ca399ea 100644 --- a/README.md +++ b/README.md @@ -328,10 +328,9 @@ Add a vector column ```python from pgvector.sqlalchemy import Vector -from sqlalchemy import Column class Item(SQLModel, table=True): - embedding: Any = Field(sa_column=Column(Vector(3))) + embedding: Any = Field(sa_type=Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 4cb0e9b..8a1c86c 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -15,10 +15,10 @@ class Item(SQLModel, table=True): __tablename__ = 'sqlmodel_item' id: Optional[int] = Field(default=None, primary_key=True) - embedding: Optional[Any] = Field(default=None, sa_column=Column(VECTOR(3))) - half_embedding: Optional[Any] = Field(default=None, sa_column=Column(HALFVEC(3))) - binary_embedding: Optional[Any] = Field(default=None, sa_column=Column(BIT(3))) - sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(SPARSEVEC(3))) + embedding: Optional[Any] = Field(default=None, sa_type=VECTOR(3)) + half_embedding: Optional[Any] = Field(default=None, sa_type=HALFVEC(3)) + binary_embedding: Optional[Any] = Field(default=None, sa_type=BIT(3)) + sparse_embedding: Optional[Any] = Field(default=None, sa_type=SPARSEVEC(3)) SQLModel.metadata.drop_all(engine) From b3e8908d3b3d74eba016b1cdc5bc7b1df1ad92bf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:36:07 -0800 Subject: [PATCH 037/123] Removed unneeded code [skip ci] --- tests/test_django.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 353087e..2c53d82 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -86,9 +86,6 @@ class Meta: class Migration(migrations.Migration): initial = True - dependencies = [ - ] - operations = [ VectorExtension(), migrations.CreateModel( From edd9b4ba02160ef429c4e44455eb0bfe6c781092 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:37:05 -0800 Subject: [PATCH 038/123] Test with Python 3.13 on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 04f1c21..562ba94 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.12, 3.8] + python: [3.13, 3.8] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 From 1b9df46f9542f3262f6c93a1a858c1414a0ffdc5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:38:38 -0800 Subject: [PATCH 039/123] Improved test code [skip ci] --- tests/test_sqlmodel.py | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 8a1c86c..851afd8 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,9 +1,8 @@ import numpy as np from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest -from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError -from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text +from sqlmodel import Field, Index, Session, SQLModel, create_engine, delete, select, text from typing import Any, Optional engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') From a3c611f3f141a00c42b311f387278bb4f3ee4bcf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:42:16 -0800 Subject: [PATCH 040/123] Updated examples [skip ci] --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ca399ea..d7a7e6c 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(Base): - embedding = mapped_column(Vector(3)) + embedding = mapped_column(VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` @@ -274,11 +274,11 @@ session.scalars(select(Item).order_by(order).limit(5)) Add an array column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import ARRAY class Item(Base): - embeddings = mapped_column(ARRAY(Vector(3))) + embeddings = mapped_column(ARRAY(VECTOR(3))) ``` And register the types with the underlying driver @@ -327,10 +327,10 @@ session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(SQLModel, table=True): - embedding: Any = Field(sa_type=Vector(3)) + embedding: Any = Field(sa_type=VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` From 2ba2a855164f6f0947f17b94201a46d5ad615e6c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 20:00:48 -0800 Subject: [PATCH 041/123] Improved example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7a7e6c..794cf91 100644 --- a/README.md +++ b/README.md @@ -376,7 +376,7 @@ Also supports `sum` Add an approximate index ```python -from sqlalchemy import Index +from sqlmodel import Index index = Index( 'my_index', From c6d2ddd429c10316ef329dd07ab86fe192bc71a0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:09:23 -0800 Subject: [PATCH 042/123] Improved sparsevec tests [skip ci] --- pgvector/utils/sparsevec.py | 1 + tests/test_psycopg.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index fd9ccff..a370c5e 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -108,6 +108,7 @@ def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) + # TODO convert indices and values to lists in 0.4.0 return cls._from_parts(int(dim), indices, values) @classmethod diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 5802b2b..6d4f34a 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -110,12 +110,19 @@ 
def test_sparsevec(self): def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res.dimensions() == 6 + # TODO convert indices and values to lists in 0.4.0 + assert res.indices() == (0, 2, 4) + assert res.values() == (1.5, 2, 3) assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res.dimensions() == 6 + assert res.indices() == [0, 2, 4] + assert res.values() == [1.5, 2, 3] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From 9d9f45b800f3731e213f7b06bf3374e177ad86d5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:11:22 -0800 Subject: [PATCH 043/123] Added todo [skip ci] --- pgvector/psycopg2/register.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 7752852..08a69a9 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,6 +5,7 @@ from .vector import register_vector_info +# TODO remove default value for conn_or_curs in 0.4.0 # TODO make globally False by default in 0.4.0 # note: register_adapter is always global # TODO make arrays True by defalt in 0.4.0 From 972b6739788f5a09ec270bed552182a052e994c5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:18:26 -0800 Subject: [PATCH 044/123] Updated license year [skip ci] --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index d205f4e..b612d6d 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2021-2024 Andrew Kane +Copyright (c) 2021-2025 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 57b7d3ba12781871045a378221d90bc972a3d5c1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:26:02 -0800 Subject: [PATCH 045/123] Added test for vector type with SQLAlchemy and asyncpg - #114 --- tests/test_sqlalchemy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0b53252..6fc0adf 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -527,6 +527,29 @@ def connect(dbapi_connection, connection_record): await engine.dispose() + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_vector(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = np.array([1, 2, 3]) + session.add(Item(id=1, embedding=embedding)) + item = await session.get(Item, 1) + assert np.array_equal(item.embedding, embedding) + + await engine.dispose() + @pytest.mark.asyncio 
@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_bit(self): From bf9a0a469983eabb1b1b38c6ba2495e3c4c2b8ce Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:29:37 -0800 Subject: [PATCH 046/123] Added tests for halfvec and sparsevec types with SQLAlchemy and asyncpg [skip ci] --- tests/test_sqlalchemy.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 6fc0adf..40068e9 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -550,6 +550,29 @@ async def test_asyncpg_vector(self): await engine.dispose() + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_halfvec(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, half_embedding=embedding)) + item = await session.get(Item, 1) + assert item.half_embedding.to_list() == embedding + + await engine.dispose() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_bit(self): @@ -566,3 +589,26 @@ async def test_asyncpg_bit(self): assert item.binary_embedding == embedding await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_sparsevec(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, sparse_embedding=embedding)) + item = await session.get(Item, 1) + assert item.sparse_embedding.to_list() == embedding + + await engine.dispose() From 257eb3b92c9f02e2ca266a15c6c8b93ebc94082a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:30:55 -0800 Subject: [PATCH 047/123] Simplified tests [skip ci] --- tests/test_sqlalchemy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 40068e9..519a388 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -530,8 +530,6 @@ def connect(dbapi_connection, connection_record): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_vector(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -553,8 +551,6 @@ async def test_asyncpg_vector(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, 
reason='Requires SQLAlchemy 2+') async def test_asyncpg_halfvec(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -593,8 +589,6 @@ async def test_asyncpg_bit(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_sparsevec(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From 91f5d34c11f0064c83ca08b7e69055ce6ef03124 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:32:53 -0800 Subject: [PATCH 048/123] Added test for vector[] type with SQLAlchemy and asyncpg [skip ci] --- tests/test_sqlalchemy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 519a388..7e8b888 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -606,3 +606,26 @@ async def test_asyncpg_sparsevec(self): assert item.sparse_embedding.to_list() == embedding await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_vector_array(self): + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays + item = await session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + + await engine.dispose() From f7eeb3a04554b9adf82a5073d08fc757c41604a3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:36:40 -0800 Subject: [PATCH 049/123] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 7e8b888..f3d045f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -492,7 +492,7 @@ def test_binary_quantize(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_async(self): + async def test_async_avg(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -622,8 +622,6 @@ async def test_asyncpg_vector_array(self): async with async_session() as session: async with session.begin(): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this fails if the driver does not cast arrays item = await session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] From 2d2563d702ee319a33d17b27549bce035a6c7348 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:39:55 -0800 Subject: [PATCH 050/123] Improved test names [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f3d045f..fd46e74 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -492,7 +492,7 @@ def test_binary_quantize(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_async_avg(self): + async def test_psycopg_async_avg(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -507,7 +507,7 @@ async def test_async_avg(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_async_vector_array(self): + async def test_psycopg_async_vector_array(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From d828239fb466e11a8fb02c7e35a052dbbce3e5b8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 01:00:50 -0800 Subject: [PATCH 051/123] Revert "Updated examples [skip ci]" This reverts commit a3c611f3f141a00c42b311f387278bb4f3ee4bcf. --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 794cf91..5a59c9d 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector class Item(Base): - embedding = mapped_column(VECTOR(3)) + embedding = mapped_column(Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` @@ -274,11 +274,11 @@ session.scalars(select(Item).order_by(order).limit(5)) Add an array column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector from sqlalchemy import ARRAY class Item(Base): - embeddings = mapped_column(ARRAY(VECTOR(3))) + embeddings = mapped_column(ARRAY(Vector(3))) ``` And register the types with the underlying driver @@ -327,10 +327,10 @@ session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector class Item(SQLModel, table=True): - embedding: Any = Field(sa_type=VECTOR(3)) + embedding: Any = Field(sa_type=Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` From 8a7040d2ee79ac8fc6313538ffbc38ebad3ac197 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 01:34:02 -0800 Subject: [PATCH 052/123] Removed unneeded code [skip ci] --- examples/citus/example.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/citus/example.py b/examples/citus/example.py index d448204..915c25f 100644 --- a/examples/citus/example.py +++ b/examples/citus/example.py @@ -40,9 +40,6 @@ for i in range(rows): copy.write_row([embeddings[i], categories[i]]) - while conn.pgconn.flush() == 1: - pass - print('Creating index in parallel') conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') From 00cd08e6c44077b99f378edbd007b2483ff406f7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:09:23 -0800 Subject: [PATCH 053/123] Improved tests --- tests/test_sqlalchemy.py | 139 ++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 68 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index fd46e74..405cd21 100644 --- a/tests/test_sqlalchemy.py +++ 
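On the Citus example change above: with Psycopg 3, exiting the `copy` context manager finishes the COPY and flushes any buffered rows, which is why the manual flush loop could be dropped. A rough sketch of the pattern, assuming an `items (embedding vector(3), category_id bigint)` table and a connection registered with `register_vector`:

```python
import numpy as np
import psycopg
from pgvector.psycopg import register_vector

conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True)
register_vector(conn)

rows = 1000
embeddings = np.random.rand(rows, 3)
categories = np.random.randint(10, size=rows).tolist()

cur = conn.cursor()
with cur.copy('COPY items (embedding, category_id) FROM STDIN WITH (FORMAT BINARY)') as copy:
    copy.set_types(['vector', 'bigint'])
    for i in range(rows):
        copy.write_row([embeddings[i], categories[i]])
# the COPY is completed when the with block exits
```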
b/tests/test_sqlalchemy.py @@ -76,11 +76,11 @@ class Item(Base): def create_items(): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) - session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) - session.commit() + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) + session.commit() class TestSqlalchemy: @@ -129,11 +129,11 @@ def test_orm(self): item2 = Item(embedding=[4, 5, 6]) item3 = Item() - session = Session(engine) - session.add(item) - session.add(item2) - session.add(item3) - session.commit() + with Session(engine) as session: + session.add(item) + session.add(item2) + session.add(item3) + session.commit() stmt = select(Item) with Session(engine) as session: @@ -148,11 +148,11 @@ def test_orm(self): assert items[2].embedding is None def test_vector(self): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] def test_vector_l2_distance(self): create_items() @@ -203,11 +203,11 @@ def test_vector_l1_distance_orm(self): assert [v.id for v in items] == [1, 3, 2] def test_halfvec(self): - session = Session(engine) - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] def test_halfvec_l2_distance(self): create_items() @@ -258,11 +258,11 @@ def test_halfvec_l1_distance_orm(self): assert [v.id for v in items] == [1, 3, 2] def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' def test_bit_hamming_distance(self): create_items() @@ -289,11 +289,11 @@ def test_bit_jaccard_distance_orm(self): assert [v.id for v in items] == [2, 3, 1] def test_sparsevec(self): - session = Session(engine) - session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() @@ -405,24 +405,24 @@ 
def test_sum_orm(self): def test_bad_dimensions(self): item = Item(embedding=[1, 2]) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() def test_bad_ndim(self): item = Item(embedding=np.array([[1, 2, 3]])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected ndim to be 1'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected ndim to be 1'): + session.commit() def test_bad_dtype(self): item = Item(embedding=np.array(['one', 'two', 'three'])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='could not convert string to float'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='could not convert string to float'): + session.commit() def test_inspect(self): columns = inspect(engine).get_columns('sqlalchemy_orm_item') @@ -433,14 +433,17 @@ def test_literal_binds(self): assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) def test_insert(self): - session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) + with Session(engine) as session: + session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) def test_insert_bulk(self): - session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) + with Session(engine) as session: + session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) # register_vector in psycopg2 tests change this behavior # def test_insert_text(self): - # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + # with Session(engine) as session: + # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) def test_automap(self): metadata = MetaData() @@ -448,29 +451,30 @@ def test_automap(self): AutoBase = automap_base(metadata=metadata) AutoBase.prepare() AutoItem = AutoBase.classes.sqlalchemy_orm_item - session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) - item = session.query(AutoItem).first() - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) + item = session.query(AutoItem).first() + assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): - session = Session(array_engine) - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - session.commit() + with Session(array_engine) as session: + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self): - session = Session(array_engine) - session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - session.commit() + with Session(array_engine) as session: + session.add(Item(id=1, 
half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] def test_half_precision(self): create_items() @@ -479,13 +483,12 @@ def test_half_precision(self): assert [v.id for v in items] == [1, 3, 2] def test_binary_quantize(self): - session = Session(engine) - session.add(Item(id=1, embedding=[-1, -2, -3])) - session.add(Item(id=2, embedding=[1, -2, 3])) - session.add(Item(id=3, embedding=[1, 2, 3])) - session.commit() - with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) items = session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] From 7837e92d72eef265e075e8ea5aa305e159e41437 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:15:18 -0800 Subject: [PATCH 054/123] Added more tests for SQLAlchemy --- tests/test_sqlalchemy.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 405cd21..79b3c50 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -29,6 +29,8 @@ def connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) +psycopg3_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + Base = declarative_base() @@ -493,6 +495,34 @@ def test_binary_quantize(self): items = session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] + def test_psycopg_vector(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] + + def test_psycopg_halfvec(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] + + def test_psycopg_bit(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_psycopg_sparsevec(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_psycopg_async_avg(self): From f08cec7a0522b19942a02df14f3f396f0773c912 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:46:42 -0800 Subject: [PATCH 055/123] Parameterize SQLAlchemy tests --- tests/test_sqlalchemy.py | 181 ++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 97 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 79b3c50..a4ac860 100644 --- 
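The conversions to `with Session(engine) as session:` above use the context-manager form of `Session`, which closes the session and returns its connection to the pool when the block exits, even if an assertion fails partway through. A condensed sketch of the pattern, reusing the engine URL from these tests:

```python
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test')

# the session is closed automatically when the block exits
with Session(engine) as session:
    session.execute(text('SELECT 1'))
    session.commit()
```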
a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -15,8 +15,15 @@ mapped_column = Column sqlalchemy_version = 1 -engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -with Session(engine) as session: +psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +engines = [psycopg2_engine] + +if sqlalchemy_version > 1: + psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + engines.append(psycopg_engine) + +setup_engine = engines[0] +with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() @@ -29,8 +36,6 @@ def connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) -psycopg3_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - Base = declarative_base() @@ -46,8 +51,8 @@ class Item(Base): half_embeddings = mapped_column(ARRAY(HALFVEC(3))) -Base.metadata.drop_all(engine) -Base.metadata.create_all(engine) +Base.metadata.drop_all(setup_engine) +Base.metadata.create_all(setup_engine) index = Index( 'sqlalchemy_orm_index', @@ -56,7 +61,7 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_l2_ops'} ) -index.create(engine) +index.create(setup_engine) half_precision_index = Index( 'sqlalchemy_orm_half_precision_index', @@ -65,7 +70,7 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'halfvec_l2_ops'} ) -half_precision_index.create(engine) +half_precision_index.create(setup_engine) binary_quantize_index = Index( 'sqlalchemy_orm_binary_quantize_index', @@ -74,24 +79,29 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'bit_hamming_ops'} ) -binary_quantize_index.create(engine) +binary_quantize_index.create(setup_engine) def create_items(): - with Session(engine) as session: + with Session(setup_engine) as session: session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) session.commit() +def delete_items(): + with Session(setup_engine) as session: + session.query(Item).delete() + session.commit() + + +@pytest.mark.parametrize("engine", engines) class TestSqlalchemy: - def setup_method(self, test_method): - with Session(engine) as session: - session.query(Item).delete() - session.commit() + def setup_method(self): + delete_items() - def test_core(self): + def test_core(self, engine): metadata = MetaData() item_table = Table( @@ -126,7 +136,7 @@ def test_core(self): ) hnsw_index.create(engine) - def test_orm(self): + def test_orm(self, engine): item = Item(embedding=np.array([1.5, 2, 3])) item2 = Item(embedding=[4, 5, 6]) item3 = Item() @@ -140,236 +150,236 @@ def test_orm(self): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id == 1 - assert items[1].id == 2 - assert items[2].id == 3 + assert items[0].id in [1, 4] + assert items[1].id in [2, 5] + assert items[2].id in [3, 6] assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert 
np.array_equal(items[1].embedding, np.array([4, 5, 6])) assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None - def test_vector(self): + def test_vector(self, engine): with Session(engine) as session: session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.embedding.tolist() == [1, 2, 3] - def test_vector_l2_distance(self): + def test_vector_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_vector_l2_distance_orm(self): + def test_vector_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_vector_max_inner_product(self): + def test_vector_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_vector_max_inner_product_orm(self): + def test_vector_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_vector_cosine_distance(self): + def test_vector_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_vector_cosine_distance_orm(self): + def test_vector_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_vector_l1_distance(self): + def test_vector_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_vector_l1_distance_orm(self): + def test_vector_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_halfvec(self): + def test_halfvec(self, engine): with Session(engine) as session: session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.half_embedding.to_list() == [1, 2, 3] - def test_halfvec_l2_distance(self): + def test_halfvec_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_l2_distance_orm(self): + def test_halfvec_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_max_inner_product(self): + def test_halfvec_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - 
def test_halfvec_max_inner_product_orm(self): + def test_halfvec_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_halfvec_cosine_distance(self): + def test_halfvec_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_halfvec_cosine_distance_orm(self): + def test_halfvec_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_halfvec_l1_distance(self): + def test_halfvec_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_l1_distance_orm(self): + def test_halfvec_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_bit(self): + def test_bit(self, engine): with Session(engine) as session: session.add(Item(id=1, binary_embedding='101')) session.commit() item = session.get(Item, 1) assert item.binary_embedding == '101' - def test_bit_hamming_distance(self): + def test_bit_hamming_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] - def test_bit_hamming_distance_orm(self): + def test_bit_hamming_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) assert [v.id for v in items] == [2, 3, 1] - def test_bit_jaccard_distance(self): + def test_bit_jaccard_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] - def test_bit_jaccard_distance_orm(self): + def test_bit_jaccard_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec(self): + def test_sparsevec(self, engine): with Session(engine) as session: session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.sparse_embedding.to_list() == [1, 2, 3] - def test_sparsevec_l2_distance(self): + def test_sparsevec_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_l2_distance_orm(self): + def test_sparsevec_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_max_inner_product(self): + def 
test_sparsevec_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec_max_inner_product_orm(self): + def test_sparsevec_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec_cosine_distance(self): + def test_sparsevec_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_sparsevec_cosine_distance_orm(self): + def test_sparsevec_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_sparsevec_l1_distance(self): + def test_sparsevec_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_l1_distance_orm(self): + def test_sparsevec_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_filter(self): + def test_filter(self, engine): create_items() with Session(engine) as session: items = session.query(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1).all() assert [v.id for v in items] == [1] - def test_filter_orm(self): + def test_filter_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1)) assert [v.id for v in items] == [1] - def test_select(self): + def test_select(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) items = session.query(Item.embedding.l2_distance([1, 1, 1])).first() assert items[0] == 3 - def test_select_orm(self): + def test_select_orm(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) items = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() assert items[0] == 3 - def test_avg(self): + def test_avg(self, engine): with Session(engine) as session: res = session.query(avg(Item.embedding)).first()[0] assert res is None @@ -378,7 +388,7 @@ def test_avg(self): res = session.query(avg(Item.embedding)).first()[0] assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) - def test_avg_orm(self): + def test_avg_orm(self, engine): with Session(engine) as session: res = session.scalars(select(avg(Item.embedding))).first() assert res is None @@ -387,7 +397,7 @@ def test_avg_orm(self): res = session.scalars(select(avg(Item.embedding))).first() assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) - def test_sum(self): + def test_sum(self, engine): with Session(engine) as session: res = session.query(sum(Item.embedding)).first()[0] assert res is None @@ -396,7 +406,7 @@ def test_sum(self): res = session.query(sum(Item.embedding)).first()[0] assert np.array_equal(res, np.array([5, 7, 9])) - def test_sum_orm(self): + def test_sum_orm(self, engine): 
with Session(engine) as session: res = session.scalars(select(sum(Item.embedding))).first() assert res is None @@ -405,40 +415,40 @@ def test_sum_orm(self): res = session.scalars(select(sum(Item.embedding))).first() assert np.array_equal(res, np.array([5, 7, 9])) - def test_bad_dimensions(self): + def test_bad_dimensions(self, engine): item = Item(embedding=[1, 2]) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): session.commit() - def test_bad_ndim(self): + def test_bad_ndim(self, engine): item = Item(embedding=np.array([[1, 2, 3]])) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='expected ndim to be 1'): session.commit() - def test_bad_dtype(self): + def test_bad_dtype(self, engine): item = Item(embedding=np.array(['one', 'two', 'three'])) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='could not convert string to float'): session.commit() - def test_inspect(self): + def test_inspect(self, engine): columns = inspect(engine).get_columns('sqlalchemy_orm_item') assert isinstance(columns[1]['type'], VECTOR) - def test_literal_binds(self): + def test_literal_binds(self, engine): sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) - def test_insert(self): + def test_insert(self, engine): with Session(engine) as session: session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) - def test_insert_bulk(self): + def test_insert_bulk(self, engine): with Session(engine) as session: session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) @@ -447,7 +457,7 @@ def test_insert_bulk(self): # with Session(engine) as session: # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) - def test_automap(self): + def test_automap(self, engine): metadata = MetaData() metadata.reflect(engine, only=['sqlalchemy_orm_item']) AutoBase = automap_base(metadata=metadata) @@ -458,7 +468,7 @@ def test_automap(self): item = session.query(AutoItem).first() assert item.embedding.tolist() == [1, 2, 3] - def test_vector_array(self): + def test_vector_array(self, engine): with Session(array_engine) as session: session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -468,7 +478,7 @@ def test_vector_array(self): assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] - def test_halfvec_array(self): + def test_halfvec_array(self, engine): with Session(array_engine) as session: session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -478,13 +488,13 @@ def test_halfvec_array(self): assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] - def test_half_precision(self): + def test_half_precision(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_binary_quantize(self): + def test_binary_quantize(self, engine): with Session(engine) as session: session.add(Item(id=1, embedding=[-1, -2, -3])) session.add(Item(id=2, embedding=[1, -2, 3])) @@ -495,33 +505,10 @@ def test_binary_quantize(self): items = 
session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] - def test_psycopg_vector(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] - - def test_psycopg_halfvec(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] - def test_psycopg_bit(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' - - def test_psycopg_sparsevec(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] +class TestSqlalchemyAsync: + def setup_method(self): + delete_items() @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') From 5e381602a739ca5307f02c75ee57d219555f5ada Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:54:57 -0800 Subject: [PATCH 056/123] Improved array tests --- tests/test_sqlalchemy.py | 49 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index a4ac860..b1f3e85 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -27,10 +27,11 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() -array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +array_engines = [psycopg2_array_engine] -@event.listens_for(array_engine, "connect") +@event.listens_for(psycopg2_array_engine, "connect") def connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) @@ -468,8 +469,31 @@ def test_automap(self, engine): item = session.query(AutoItem).first() assert item.embedding.tolist() == [1, 2, 3] + def test_half_precision(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_binary_quantize(self, engine): + with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + items = session.query(Item).order_by(distance).all() + assert [v.id for v in items] == [2, 3, 1] + + +@pytest.mark.parametrize("engine", array_engines) +class TestSqlalchemyArray: + def setup_method(self): + delete_items() + def test_vector_array(self, engine): - with Session(array_engine) as session: + with Session(engine) as session: session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -479,7 +503,7 @@ def test_vector_array(self, engine): assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self, engine): - with 
Session(array_engine) as session: + with Session(engine) as session: session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -488,23 +512,6 @@ def test_halfvec_array(self, engine): assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] - def test_half_precision(self, engine): - create_items() - with Session(engine) as session: - items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() - assert [v.id for v in items] == [1, 3, 2] - - def test_binary_quantize(self, engine): - with Session(engine) as session: - session.add(Item(id=1, embedding=[-1, -2, -3])) - session.add(Item(id=2, embedding=[1, -2, 3])) - session.add(Item(id=3, embedding=[1, 2, 3])) - session.commit() - - distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) - items = session.query(Item).order_by(distance).all() - assert [v.id for v in items] == [2, 3, 1] - class TestSqlalchemyAsync: def setup_method(self): From f82e44f231e498c86839735de1658ef7b8cb11a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:01:45 -0800 Subject: [PATCH 057/123] Added tests for SQLAlchemy with pg8000 --- requirements.txt | 1 + tests/test_sqlalchemy.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0e30959..a13be06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ asyncpg Django numpy peewee +pg8000 psycopg[binary,pool] psycopg2-binary pytest diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index b1f3e85..37e803d 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,4 +1,5 @@ import numpy as np +import os from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -16,7 +17,8 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -engines = [psycopg2_engine] +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ['USER']}@localhost/pgvector_python_test') +engines = [psycopg2_engine, pg8000_engine] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @@ -151,9 +153,9 @@ def test_orm(self, engine): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id in [1, 4] - assert items[1].id in [2, 5] - assert items[2].id in [3, 6] + assert items[0].id in [1, 4, 7] + assert items[1].id in [2, 5, 8] + assert items[2].id in [3, 6, 9] assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) @@ -290,12 +292,18 @@ def test_bit_hamming_distance_orm(self, engine): assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance(self, engine): + if engine == pg8000_engine: + return + create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance_orm(self, engine): + if engine == pg8000_engine: + return + create_items() with Session(engine) as session: items = 
session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) From cfcc2ea7b8b942c47c378bf47a4490c5acb50ec7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:02:44 -0800 Subject: [PATCH 058/123] Updated style [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 37e803d..aa2ad97 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -99,7 +99,7 @@ def delete_items(): session.commit() -@pytest.mark.parametrize("engine", engines) +@pytest.mark.parametrize('engine', engines) class TestSqlalchemy: def setup_method(self): delete_items() @@ -495,7 +495,7 @@ def test_binary_quantize(self, engine): assert [v.id for v in items] == [2, 3, 1] -@pytest.mark.parametrize("engine", array_engines) +@pytest.mark.parametrize('engine', array_engines) class TestSqlalchemyArray: def setup_method(self): delete_items() From 95403d5268e11ab6efef969f46f086e3f57e2b52 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:05:34 -0800 Subject: [PATCH 059/123] Added tests for arrays with SQLAlchemy and Psycopg 3 --- tests/test_sqlalchemy.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index aa2ad97..f4a6bce 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -34,11 +34,21 @@ @event.listens_for(psycopg2_array_engine, "connect") -def connect(dbapi_connection, connection_record): +def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) +if sqlalchemy_version > 1: + psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + array_engines.append(psycopg_array_engine) + + @event.listens_for(psycopg_array_engine, "connect") + def psycopg_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector + register_vector(dbapi_connection) + + Base = declarative_base() From c74e090f26a02fc920ef910265ac0e4f2eb7cbde Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:11:46 -0800 Subject: [PATCH 060/123] Fixed CI --- tests/test_sqlalchemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f4a6bce..7dbc565 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -17,7 +17,7 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ['USER']}@localhost/pgvector_python_test') +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') engines = [psycopg2_engine, pg8000_engine] if sqlalchemy_version > 1: From b350d6a8d45d02ea954fad945d194896c50fbc1e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:13:46 -0800 Subject: [PATCH 061/123] Simplified test code [skip ci] --- tests/test_django.py | 2 +- tests/test_peewee.py | 2 +- tests/test_psycopg.py | 2 +- tests/test_psycopg2.py | 2 +- tests/test_sqlmodel.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 2c53d82..ea15771 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -158,7 +158,7 @@ class Meta: class TestDjango: - def setup_method(self, test_method): + def 
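A compressed sketch of the direction these changes take: pytest parametrization runs every test body once per driver engine, with driver-specific cases (such as the jaccard-distance tests on pg8000) skipped inside the test itself. The URLs below mirror the ones used in the test module:

```python
import os

import pytest
from sqlalchemy import create_engine, text

engines = [
    create_engine('postgresql+psycopg2://localhost/pgvector_python_test'),
    create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test'),
]


@pytest.mark.parametrize('engine', engines)
class TestEngines:
    def test_select_one(self, engine):
        # the same body runs once for each engine in the list
        with engine.connect() as connection:
            assert connection.execute(text('SELECT 1')).scalar() == 1
```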
setup_method(self): Item.objects.all().delete() def test_vector(self): diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 9666388..e98a0ec 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -36,7 +36,7 @@ def create_items(): class TestPeewee: - def setup_method(self, test_method): + def setup_method(self): Item.truncate_table() def test_vector(self): diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 6d4f34a..90f80b6 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -14,7 +14,7 @@ class TestPsycopg: - def setup_method(self, test_method): + def setup_method(self): conn.execute('DELETE FROM psycopg_items') def test_vector(self): diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index c3cd3cd..d661f12 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -16,7 +16,7 @@ class TestPsycopg2: - def setup_method(self, test_method): + def setup_method(self): cur.execute('DELETE FROM psycopg2_items') def test_vector(self): diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 851afd8..e0330d2 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -42,7 +42,7 @@ def create_items(): class TestSqlmodel: - def setup_method(self, test_method): + def setup_method(self): with Session(engine) as session: session.exec(delete(Item)) session.commit() From 651df0844b3c6790414ec2e8ed75330ad80406af Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:15:52 -0800 Subject: [PATCH 062/123] Improved SQLModel tests [skip ci] --- tests/test_sqlmodel.py | 68 +++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index e0330d2..373834f 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -34,11 +34,11 @@ class Item(SQLModel, table=True): def create_items(): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) - session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) - session.commit() + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) + session.commit() class TestSqlmodel: @@ -52,11 +52,11 @@ def test_orm(self): item2 = Item(embedding=[4, 5, 6]) item3 = Item() - session = Session(engine) - session.add(item) - session.add(item2) - session.add(item3) - session.commit() + with Session(engine) as session: + session.add(item) + session.add(item2) + session.add(item3) + session.commit() stmt = select(Item) with Session(engine) as session: @@ -71,11 +71,11 @@ def test_orm(self): assert items[2].embedding is None def test_vector(self): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 
3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] def test_vector_l2_distance(self): create_items() @@ -102,11 +102,11 @@ def test_vector_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_halfvec(self): - session = Session(engine) - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] def test_halfvec_l2_distance(self): create_items() @@ -133,11 +133,11 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' def test_bit_hamming_distance(self): create_items() @@ -152,11 +152,11 @@ def test_bit_jaccard_distance(self): assert [v.id for v in items] == [2, 3, 1] def test_sparsevec(self): - session = Session(engine) - session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() @@ -232,7 +232,7 @@ def test_halfvec_sum(self): def test_bad_dimensions(self): item = Item(embedding=[1, 2]) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() From 2883156b461f08fe32be81439d8e653ac1c41c5a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:42:19 -0800 Subject: [PATCH 063/123] Improved tests for async SQLAlchemy engines [skip ci] --- tests/test_sqlalchemy.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 7dbc565..6e1d496 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,11 +19,18 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') engines = [psycopg2_engine, pg8000_engine] +async_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') engines.append(psycopg_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async_engines.append(psycopg_async_engine) + + asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_engines.append(asyncpg_engine) + setup_engine = engines[0] with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) @@ -531,14 +538,13 @@ def test_halfvec_array(self, engine): assert 
item.half_embeddings[1].to_list() == [4, 5, 6] +@pytest.mark.parametrize('engine', async_engines) class TestSqlalchemyAsync: def setup_method(self): delete_items() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_avg(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def test_psycopg_async_avg(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -550,6 +556,11 @@ async def test_psycopg_async_avg(self): await engine.dispose() + +class TestSqlalchemyAsync2: + def setup_method(self): + delete_items() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_psycopg_async_vector_array(self): From 86331f0ee6650adcdb655b5d092f1c24d3b0fa84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:45:22 -0800 Subject: [PATCH 064/123] Improved tests for async SQLAlchemy engines [skip ci] --- tests/test_sqlalchemy.py | 104 +++++++++++++++------------------------ 1 file changed, 39 insertions(+), 65 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 6e1d496..689615b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -544,122 +544,96 @@ def setup_method(self): delete_items() @pytest.mark.asyncio - async def test_psycopg_async_avg(self, engine): + async def test_vector(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - session.add(Item(embedding=[1, 2, 3])) - session.add(Item(embedding=[4, 5, 6])) - avg = await session.scalars(select(func.avg(Item.embedding))) - assert avg.first() == '[2.5,3.5,4.5]' + embedding = np.array([1, 2, 3]) + session.add(Item(id=1, embedding=embedding)) + item = await session.get(Item, 1) + assert np.array_equal(item.embedding, embedding) await engine.dispose() - -class TestSqlalchemyAsync2: - def setup_method(self): - delete_items() - @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_vector_array(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def test_halfvec(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - @event.listens_for(engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector_async - dbapi_connection.run_async(register_vector_async) - async with async_session() as session: async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this fails if the driver does not cast arrays + embedding = [1, 2, 3] + session.add(Item(id=1, half_embedding=embedding)) item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert item.half_embedding.to_list() == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_vector(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_session = async_sessionmaker(engine, expire_on_commit=False) + async def test_bit(self, engine): + import asyncpg - # TODO do not throw error when types are registered - # 
@event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) + async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - embedding = np.array([1, 2, 3]) - session.add(Item(id=1, embedding=embedding)) + embedding = asyncpg.BitString('101') if engine == asyncpg_engine else '101' + session.add(Item(id=1, binary_embedding=embedding)) item = await session.get(Item, 1) - assert np.array_equal(item.embedding, embedding) + assert item.binary_embedding == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_halfvec(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_sparsevec(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) - async with async_session() as session: async with session.begin(): embedding = [1, 2, 3] - session.add(Item(id=1, half_embedding=embedding)) + session.add(Item(id=1, sparse_embedding=embedding)) item = await session.get(Item, 1) - assert item.half_embedding.to_list() == embedding + assert item.sparse_embedding.to_list() == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_bit(self): - import asyncpg - - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_avg(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - embedding = asyncpg.BitString('101') - session.add(Item(id=1, binary_embedding=embedding)) - item = await session.get(Item, 1) - assert item.binary_embedding == embedding + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + avg = await session.scalars(select(func.avg(Item.embedding))) + assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() + +class TestSqlalchemyAsync2: + def setup_method(self): + delete_items() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_sparsevec(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_psycopg_async_vector_array(self): + engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) + @event.listens_for(engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) async with async_session() as session: async with session.begin(): - embedding = [1, 2, 3] - session.add(Item(id=1, 
sparse_embedding=embedding)) + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays item = await session.get(Item, 1) - assert item.sparse_embedding.to_list() == embedding + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] await engine.dispose() From 224c18a47c2a2d652fe1f7267449e61c290b249f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:49:14 -0800 Subject: [PATCH 065/123] Simplified test code [skip ci] --- tests/test_sqlalchemy.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 689615b..07e29e7 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,3 +1,4 @@ +import asyncpg import numpy as np import os from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum @@ -571,8 +572,6 @@ async def test_halfvec(self, engine): @pytest.mark.asyncio async def test_bit(self, engine): - import asyncpg - async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -611,13 +610,13 @@ async def test_avg(self, engine): await engine.dispose() -class TestSqlalchemyAsync2: +@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') +class TestSqlalchemyAsyncArray: def setup_method(self): delete_items() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_vector_array(self): + async def test_psycopg_vector_array(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -638,7 +637,6 @@ def connect(dbapi_connection, connection_record): await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_vector_array(self): engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From a1d89971cd10ebe0dc11969ec532011eaa8a9a78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:02:05 -0800 Subject: [PATCH 066/123] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 07e29e7..8868df4 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,7 +19,17 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') +psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') + + +@event.listens_for(psycopg2_array_engine, "connect") +def psycopg2_connect(dbapi_connection, connection_record): + from pgvector.psycopg2 import register_vector + register_vector(dbapi_connection, globally=False, arrays=True) + + engines = [psycopg2_engine, pg8000_engine] +array_engines = [psycopg2_array_engine] async_engines = [] if sqlalchemy_version > 1: @@ -32,22 +42,6 @@ asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) -setup_engine = engines[0] -with Session(setup_engine) as session: - 
session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) - session.commit() - -psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -array_engines = [psycopg2_array_engine] - - -@event.listens_for(psycopg2_array_engine, "connect") -def psycopg2_connect(dbapi_connection, connection_record): - from pgvector.psycopg2 import register_vector - register_vector(dbapi_connection, globally=False, arrays=True) - - -if sqlalchemy_version > 1: psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') array_engines.append(psycopg_array_engine) @@ -56,6 +50,10 @@ def psycopg_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector register_vector(dbapi_connection) +setup_engine = engines[0] +with Session(setup_engine) as session: + session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) + session.commit() Base = declarative_base() From c792451a76fea51352ad0a5f952c97eaeaea70d7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:12:57 -0800 Subject: [PATCH 067/123] Test more engine configurations --- tests/test_sqlalchemy.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 8868df4..dffa07b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,37 +19,39 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') -psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +psycopg2_type_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -@event.listens_for(psycopg2_array_engine, "connect") +@event.listens_for(psycopg2_type_engine, "connect") def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) -engines = [psycopg2_engine, pg8000_engine] -array_engines = [psycopg2_array_engine] +engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] +array_engines = [psycopg2_type_engine] async_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') engines.append(psycopg_engine) + psycopg_type_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_type_engine, "connect") + def psycopg_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector + register_vector(dbapi_connection) + + engines.append(psycopg_type_engine) + array_engines.append(psycopg_type_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_engines.append(psycopg_async_engine) asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) - psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - array_engines.append(psycopg_array_engine) - - @event.listens_for(psycopg_array_engine, "connect") - def psycopg_connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector - register_vector(dbapi_connection) - setup_engine = engines[0] with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS 
vector')) @@ -169,9 +171,10 @@ def test_orm(self, engine): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id in [1, 4, 7] - assert items[1].id in [2, 5, 8] - assert items[2].id in [3, 6, 9] + # TODO improve + assert items[0].id % 3 == 1 + assert items[1].id % 3 == 2 + assert items[2].id % 3 == 0 assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) From 88873e54365ca6086a1c960e2ced19ee98ea2bb2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:21:36 -0800 Subject: [PATCH 068/123] Improved tests for async SQLAlchemy engines [skip ci] --- tests/test_sqlalchemy.py | 58 +++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index dffa07b..a245ffc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -31,6 +31,7 @@ def psycopg2_connect(dbapi_connection, connection_record): engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] array_engines = [psycopg2_type_engine] async_engines = [] +async_array_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @@ -46,11 +47,32 @@ def psycopg_connect(dbapi_connection, connection_record): engines.append(psycopg_type_engine) array_engines.append(psycopg_type_engine) + psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) + + async_engines.append(psycopg_async_type_engine) + async_array_engines.append(psycopg_async_type_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_engines.append(psycopg_async_engine) asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) + async_array_engines.append(asyncpg_engine) + + asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + + @event.listens_for(asyncpg_type_engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.asyncpg import register_vector + dbapi_connection.run_async(register_vector) + + # TODO do not throw error when types are registered + # async_array_engines.append(asyncpg_type_engine) setup_engine = engines[0] with Session(setup_engine) as session: @@ -599,6 +621,10 @@ async def test_sparsevec(self, engine): @pytest.mark.asyncio async def test_avg(self, engine): + # TODO do not throw error when types are registered + if engine == psycopg_async_type_engine: + return + async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -611,43 +637,15 @@ async def test_avg(self, engine): await engine.dispose() -@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') +@pytest.mark.parametrize('engine', async_array_engines) class TestSqlalchemyAsyncArray: def setup_method(self): delete_items() @pytest.mark.asyncio - async def test_psycopg_vector_array(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def 
test_vector_array(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - @event.listens_for(engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector_async - dbapi_connection.run_async(register_vector_async) - - async with async_session() as session: - async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this fails if the driver does not cast arrays - item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] - - await engine.dispose() - - @pytest.mark.asyncio - async def test_asyncpg_vector_array(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_session = async_sessionmaker(engine, expire_on_commit=False) - - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) - async with async_session() as session: async with session.begin(): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) From e78a8d5f866c2577644a64eafcaf9939c4b9ab8c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:28:23 -0800 Subject: [PATCH 069/123] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index a245ffc..4b26922 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -18,7 +18,6 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') psycopg2_type_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') @@ -28,15 +27,10 @@ def psycopg2_connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) -engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] -array_engines = [psycopg2_type_engine] -async_engines = [] -async_array_engines = [] +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - engines.append(psycopg_engine) - psycopg_type_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_type_engine, "connect") @@ -44,9 +38,7 @@ def psycopg_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector register_vector(dbapi_connection) - engines.append(psycopg_type_engine) - array_engines.append(psycopg_type_engine) - + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") @@ -54,16 +46,7 @@ def connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector_async dbapi_connection.run_async(register_vector_async) - async_engines.append(psycopg_async_type_engine) - 
async_array_engines.append(psycopg_async_type_engine) - - psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') - async_engines.append(psycopg_async_engine) - asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_engines.append(asyncpg_engine) - async_array_engines.append(asyncpg_engine) - asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') @event.listens_for(asyncpg_type_engine.sync_engine, "connect") @@ -71,8 +54,17 @@ def connect(dbapi_connection, connection_record): from pgvector.asyncpg import register_vector dbapi_connection.run_async(register_vector) - # TODO do not throw error when types are registered - # async_array_engines.append(asyncpg_type_engine) +engines = [psycopg2_engine, psycopg2_type_engine, pg8000_engine] +array_engines = [psycopg2_type_engine] +async_engines = [] +async_array_engines = [] + +if sqlalchemy_version > 1: + engines += [psycopg_engine, psycopg_type_engine] + array_engines += [psycopg_type_engine] + async_engines += [psycopg_async_engine, psycopg_async_type_engine, asyncpg_engine] + # TODO add asyncpg_type_engine + async_array_engines += [psycopg_async_type_engine, asyncpg_engine] setup_engine = engines[0] with Session(setup_engine) as session: From a2699639d7fd468ea68442d72227d5099ad8a64b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:33:09 -0800 Subject: [PATCH 070/123] Updated todo [skip ci] --- tests/test_sqlalchemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 4b26922..09df9b9 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -62,8 +62,8 @@ def connect(dbapi_connection, connection_record): if sqlalchemy_version > 1: engines += [psycopg_engine, psycopg_type_engine] array_engines += [psycopg_type_engine] + # TODO support asyncpg_type_engine async_engines += [psycopg_async_engine, psycopg_async_type_engine, asyncpg_engine] - # TODO add asyncpg_type_engine async_array_engines += [psycopg_async_type_engine, asyncpg_engine] setup_engine = engines[0] From 7cd310b5cf986fab1da536a94e9f9c74379e46b9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:36:12 -0800 Subject: [PATCH 071/123] Improved test [skip ci] --- tests/test_sqlalchemy.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 09df9b9..563e3a3 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -613,10 +613,6 @@ async def test_sparsevec(self, engine): @pytest.mark.asyncio async def test_avg(self, engine): - # TODO do not throw error when types are registered - if engine == psycopg_async_type_engine: - return - async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -624,7 +620,10 @@ async def test_avg(self, engine): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) avg = await session.scalars(select(func.avg(Item.embedding))) - assert avg.first() == '[2.5,3.5,4.5]' + if engine == psycopg_async_type_engine: + assert avg.first().tolist() == [2.5, 3.5, 4.5] + else: + assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() From cae30a1d1b0b23620abb9c9ff4c7084ca5bac1ee Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:37:07 -0800 Subject: [PATCH 072/123] Improved test [skip ci] --- tests/test_sqlalchemy.py | 7 ++----- 1 file changed, 
2 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 563e3a3..aa5d81f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -619,11 +619,8 @@ async def test_avg(self, engine): async with session.begin(): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = await session.scalars(select(func.avg(Item.embedding))) - if engine == psycopg_async_type_engine: - assert avg.first().tolist() == [2.5, 3.5, 4.5] - else: - assert avg.first() == '[2.5,3.5,4.5]' + res = await session.scalars(select(avg(Item.embedding))) + assert res.first().tolist() == [2.5, 3.5, 4.5] await engine.dispose() From 3de7832d164b82e929e08d928501b081c93e3a5a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:27:35 -0800 Subject: [PATCH 073/123] Dropped support for Python < 3.9 --- .github/workflows/build.yml | 2 +- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 562ba94..dc53dfe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.13, 3.8] + python: [3.13, 3.9] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a517d8..1788ff4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.0 (unreleased) + +- Dropped support for Python < 3.9 + ## 0.3.6 (2024-10-26) - Added `arrays` option for Psycopg 2 diff --git a/pyproject.toml b/pyproject.toml index a6a6609..0f291f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Andrew Kane", email = "andrew@ankane.org"} ] license = {text = "MIT"} -requires-python = ">= 3.8" +requires-python = ">= 3.9" dependencies = [ "numpy" ] From 37b148f459863ee6f6c448fb93eecef569e7eb40 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:29:04 -0800 Subject: [PATCH 074/123] Removed default value [skip ci] --- pgvector/psycopg2/register.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 08a69a9..2be292f 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,11 +5,10 @@ from .vector import register_vector_info -# TODO remove default value for conn_or_curs in 0.4.0 # TODO make globally False by default in 0.4.0 # note: register_adapter is always global # TODO make arrays True by defalt in 0.4.0 -def register_vector(conn_or_curs=None, globally=True, arrays=False): +def register_vector(conn_or_curs, globally=True, arrays=False): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) scope = None if globally else conn_or_curs From 8a621a3ae96a85320475180b2120cb6d92c095a4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:33:27 -0800 Subject: [PATCH 075/123] Changed default values of globally and arrays for register_type with Psycopg 2 [skip ci] --- pgvector/psycopg2/register.py | 4 +--- tests/test_psycopg2.py | 8 ++++---- tests/test_sqlalchemy.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 2be292f..1bc9d44 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,10 +5,8 @@ from .vector import register_vector_info -# TODO make globally False 
by default in 0.4.0 # note: register_adapter is always global -# TODO make arrays True by defalt in 0.4.0 -def register_vector(conn_or_curs, globally=True, arrays=False): +def register_vector(conn_or_curs, globally=False, arrays=True): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) scope = None if globally else conn_or_curs diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index d661f12..85aa0e8 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -12,7 +12,7 @@ cur.execute('DROP TABLE IF EXISTS psycopg2_items') cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') -register_vector(cur, globally=False, arrays=True) +register_vector(cur) class TestPsycopg2: @@ -87,13 +87,13 @@ def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test') cur = conn.cursor(cursor_factory=cursor_factory) - register_vector(cur, globally=False) + register_vector(cur) conn.close() def test_cursor_factory_connection(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) - register_vector(conn, globally=False) + register_vector(conn) conn.close() def test_pool(self): @@ -102,7 +102,7 @@ def test_pool(self): conn = pool.getconn() try: # use globally=True for apps to ensure registered with all connections - register_vector(conn, globally=False) + register_vector(conn) finally: pool.putconn(conn) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index aa5d81f..067a153 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -24,7 +24,7 @@ @event.listens_for(psycopg2_type_engine, "connect") def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector - register_vector(dbapi_connection, globally=False, arrays=True) + register_vector(dbapi_connection) pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') From 32b09c0272545322b90d38139ee625a9a7809a71 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:36:23 -0800 Subject: [PATCH 076/123] Fixed indices and values methods returning tuple instead of list in certain cases [skip ci] --- pgvector/utils/sparsevec.py | 3 +-- tests/test_psycopg.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index a370c5e..0398106 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -108,8 +108,7 @@ def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - # TODO convert indices and values to lists in 0.4.0 - return cls._from_parts(int(dim), indices, values) + return cls._from_parts(int(dim), list(indices), list(values)) @classmethod def _from_parts(cls, dim, indices, values): diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 90f80b6..cf5f09a 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -111,9 +111,8 @@ def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = 
conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res.dimensions() == 6 - # TODO convert indices and values to lists in 0.4.0 - assert res.indices() == (0, 2, 4) - assert res.values() == (1.5, 2, 3) + assert res.indices() == [0, 2, 4] + assert res.values() == [1.5, 2, 3] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From c10799c3dec3dea699fc4590d3c3baa688023b23 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:38:50 -0800 Subject: [PATCH 077/123] Added support for Vector class to Psycopg 2 [skip ci] --- pgvector/psycopg2/__init__.py | 3 ++- pgvector/psycopg2/vector.py | 1 + tests/test_psycopg2.py | 12 +++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 7c95295..f109203 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,8 +1,9 @@ from .register import register_vector -from ..utils import HalfVector, SparseVector +from ..utils import HalfVector, SparseVector, Vector __all__ = [ 'register_vector', + 'Vector', 'HalfVector', 'SparseVector' ] diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 9861f01..5bd00bb 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -24,3 +24,4 @@ def register_vector_info(oid, array_oid, scope): register_type(vectorarray, scope) register_adapter(np.ndarray, VectorAdapter) + register_adapter(Vector, VectorAdapter) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 85aa0e8..f927d86 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg2 import register_vector, HalfVector, SparseVector +from pgvector.psycopg2 import register_vector, Vector, HalfVector, SparseVector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor from psycopg2.pool import ThreadedConnectionPool @@ -29,6 +29,16 @@ def test_vector(self): assert res[0][0].dtype == np.float32 assert res[1][0] is None + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + def test_halfvec(self): embedding = [1.5, 2, 3] cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) From e138e55d187c1b03f9b827c6849aabb1f9697eaf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:50:52 -0800 Subject: [PATCH 078/123] Fixed equality for types [skip ci] --- pgvector/django/functions.py | 4 ++++ pgvector/utils/bit.py | 5 +++++ pgvector/utils/halfvec.py | 5 +++++ pgvector/utils/sparsevec.py | 5 +++++ pgvector/utils/vector.py | 5 +++++ tests/test_bit.py | 4 ++++ tests/test_half_vector.py | 4 ++++ tests/test_sparse_vector.py | 5 +++++ tests/test_vector.py | 4 ++++ 9 files changed, 41 insertions(+) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index da9fbf8..6c14c3d 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -13,6 +13,10 @@ def __init__(self, expression, vector, **extra): vector = Value(SparseVector._to_db(vector)) else: vector = Value(Vector._to_db(vector)) + + # prevent error with unhashable types + 
self._constructor_args = ((expression, vector), extra) + super().__init__(expression, vector, **extra) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 51f7556..227edc1 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -24,6 +24,11 @@ def __init__(self, value): def __repr__(self): return f'Bit({self.to_text()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def to_list(self): return self._value.tolist() diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index e1e5051..f335f2f 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -16,6 +16,11 @@ def __init__(self, value): def __repr__(self): return f'HalfVector({self.to_list()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def dimensions(self): return len(self._value) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 0398106..8df2dfd 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -26,6 +26,11 @@ def __repr__(self): elements = dict(zip(self._indices, self._values)) return f'SparseVector({elements}, {self._dim})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.dimensions() == other.dimensions() and self.indices() == other.indices() and self.values() == other.values() + return False + def dimensions(self): return self._dim diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 3fa2f35..ebbcafd 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -16,6 +16,11 @@ def __init__(self, value): def __repr__(self): return f'Vector({self.to_list()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def dimensions(self): return len(self._value) diff --git a/tests/test_bit.py b/tests/test_bit.py index 32ab87b..a7e0093 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -37,3 +37,7 @@ def test_ndim_zero(self): def test_repr(self): assert repr(Bit([True, False, True])) == 'Bit(101)' assert str(Bit([True, False, True])) == 'Bit(101)' + + def test_equality(self): + assert Bit([True, False, True]) == Bit([True, False, True]) + assert Bit([True, False, True]) != Bit([True, False, False]) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index fdaa5f7..77a7869 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -38,5 +38,9 @@ def test_repr(self): assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + def test_equality(self): + assert HalfVector([1, 2, 3]) == HalfVector([1, 2, 3]) + assert HalfVector([1, 2, 3]) != HalfVector([1, 2, 4]) + def test_dimensions(self): assert HalfVector([1, 2, 3]).dimensions() == 3 diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 06fe81a..24d8c20 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -52,6 +52,11 @@ def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' + def test_equality(self): + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector([1, 0, 2, 0, 3, 0]) + assert SparseVector([1, 0, 2, 0, 3, 0]) != SparseVector([1, 0, 
2, 0, 3, 1]) + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + def test_dimensions(self): assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 diff --git a/tests/test_vector.py b/tests/test_vector.py index 1be2bc0..fe14dea 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -38,5 +38,9 @@ def test_repr(self): assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + def test_equality(self): + assert Vector([1, 2, 3]) == Vector([1, 2, 3]) + assert Vector([1, 2, 3]) != Vector([1, 2, 4]) + def test_dimensions(self): assert Vector([1, 2, 3]).dimensions() == 3 From 838ea0c73b1669c94de274eccd82f58d83ea55b4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:57:03 -0800 Subject: [PATCH 079/123] Moved classes to pgvector module [skip ci] --- pgvector/__init__.py | 11 +++++++++++ pgvector/{utils => }/bit.py | 0 pgvector/{utils => }/halfvec.py | 0 pgvector/{utils => }/sparsevec.py | 0 pgvector/utils/__init__.py | 5 +---- pgvector/{utils => }/vector.py | 0 6 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 pgvector/__init__.py rename pgvector/{utils => }/bit.py (100%) rename pgvector/{utils => }/halfvec.py (100%) rename pgvector/{utils => }/sparsevec.py (100%) rename pgvector/{utils => }/vector.py (100%) diff --git a/pgvector/__init__.py b/pgvector/__init__.py new file mode 100644 index 0000000..3c01160 --- /dev/null +++ b/pgvector/__init__.py @@ -0,0 +1,11 @@ +from .bit import Bit +from .halfvec import HalfVector +from .sparsevec import SparseVector +from .vector import Vector + +__all__ = [ + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] diff --git a/pgvector/utils/bit.py b/pgvector/bit.py similarity index 100% rename from pgvector/utils/bit.py rename to pgvector/bit.py diff --git a/pgvector/utils/halfvec.py b/pgvector/halfvec.py similarity index 100% rename from pgvector/utils/halfvec.py rename to pgvector/halfvec.py diff --git a/pgvector/utils/sparsevec.py b/pgvector/sparsevec.py similarity index 100% rename from pgvector/utils/sparsevec.py rename to pgvector/sparsevec.py diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 3c01160..1dcc240 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,7 +1,4 @@ -from .bit import Bit -from .halfvec import HalfVector -from .sparsevec import SparseVector -from .vector import Vector +from .. 
import Bit, HalfVector, SparseVector, Vector __all__ = [ 'Vector', diff --git a/pgvector/utils/vector.py b/pgvector/vector.py similarity index 100% rename from pgvector/utils/vector.py rename to pgvector/vector.py From 0ac00b4e3d39ea1ddefd8573588f7de2e60d112f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:00:08 -0800 Subject: [PATCH 080/123] Improved imports for tests [skip ci] --- tests/test_asyncpg.py | 3 ++- tests/test_bit.py | 2 +- tests/test_django.py | 3 ++- tests/test_half_vector.py | 2 +- tests/test_peewee.py | 3 ++- tests/test_psycopg.py | 3 ++- tests/test_psycopg2.py | 3 ++- tests/test_sparse_vector.py | 2 +- tests/test_sqlalchemy.py | 3 ++- tests/test_sqlmodel.py | 3 ++- tests/test_vector.py | 2 +- 11 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 48d1e32..3c36048 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,6 +1,7 @@ import asyncpg import numpy as np -from pgvector.asyncpg import register_vector, SparseVector +from pgvector import SparseVector +from pgvector.asyncpg import register_vector import pytest diff --git a/tests/test_bit.py b/tests/test_bit.py index a7e0093..e0dcfe6 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import Bit +from pgvector import Bit import pytest diff --git a/tests/test_django.py b/tests/test_django.py index ea15771..65082a3 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -12,7 +12,8 @@ import numpy as np import os import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector +from pgvector import HalfVector, SparseVector +from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance from unittest import mock settings.configure( diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 77a7869..6a94c2e 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import HalfVector +from pgvector import HalfVector import pytest diff --git a/tests/test_peewee.py b/tests/test_peewee.py index e98a0ec..670d880 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,8 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField, SparseVector +from pgvector import SparseVector +from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField db = PostgresqlDatabase('pgvector_python_test') diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index cf5f09a..6a9d0b7 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,5 +1,6 @@ import numpy as np -from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector, Vector +from pgvector import Bit, HalfVector, SparseVector, Vector +from pgvector.psycopg import register_vector, register_vector_async import psycopg from psycopg_pool import ConnectionPool, AsyncConnectionPool import pytest diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index f927d86..1994c87 
100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,6 @@ import numpy as np -from pgvector.psycopg2 import register_vector, Vector, HalfVector, SparseVector +from pgvector import HalfVector, SparseVector, Vector +from pgvector.psycopg2 import register_vector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor from psycopg2.pool import ThreadedConnectionPool diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 24d8c20..b5e7fe8 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import SparseVector +from pgvector import SparseVector import pytest from scipy.sparse import coo_array diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 067a153..052edd7 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,8 @@ import asyncpg import numpy as np import os -from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum +from pgvector import SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 373834f..b0e8ccd 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,6 @@ import numpy as np -from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum +from pgvector import SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy.exc import StatementError from sqlmodel import Field, Index, Session, SQLModel, create_engine, delete, select, text diff --git a/tests/test_vector.py b/tests/test_vector.py index fe14dea..406637f 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import Vector +from pgvector import Vector import pytest From 435e31654831d303342a1100a8dd32b6c1fe42a6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:03:28 -0800 Subject: [PATCH 081/123] Improved imports --- pgvector/asyncpg/__init__.py | 4 +++- pgvector/asyncpg/register.py | 2 +- pgvector/django/__init__.py | 4 +++- pgvector/django/functions.py | 2 +- pgvector/django/halfvec.py | 2 +- pgvector/django/sparsevec.py | 2 +- pgvector/django/vector.py | 2 +- pgvector/peewee/__init__.py | 4 +++- pgvector/peewee/halfvec.py | 2 +- pgvector/peewee/sparsevec.py | 2 +- pgvector/peewee/vector.py | 2 +- pgvector/psycopg/__init__.py | 4 +++- pgvector/psycopg/bit.py | 2 +- pgvector/psycopg/halfvec.py | 2 +- pgvector/psycopg/sparsevec.py | 2 +- pgvector/psycopg/vector.py | 2 +- pgvector/psycopg2/__init__.py | 4 +++- pgvector/psycopg2/halfvec.py | 2 +- pgvector/psycopg2/sparsevec.py | 2 +- pgvector/psycopg2/vector.py | 2 +- pgvector/sqlalchemy/__init__.py | 4 +++- pgvector/sqlalchemy/halfvec.py | 2 +- pgvector/sqlalchemy/sparsevec.py | 2 +- pgvector/sqlalchemy/vector.py | 2 +- 24 files changed, 36 insertions(+), 24 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 543b882..c6a3b4e 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector -from ..utils import Vector, HalfVector, SparseVector + +# TODO remove +from .. 
import Vector, HalfVector, SparseVector __all__ = [ 'register_vector', diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py index a388058..63726f3 100644 --- a/pgvector/asyncpg/register.py +++ b/pgvector/asyncpg/register.py @@ -1,4 +1,4 @@ -from ..utils import Vector, HalfVector, SparseVector +from .. import Vector, HalfVector, SparseVector async def register_vector(conn, schema='public'): diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 09978a9..43c64a3 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -5,7 +5,9 @@ from .indexes import IvfflatIndex, HnswIndex from .sparsevec import SparseVectorField from .vector import VectorField -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. import HalfVector, SparseVector __all__ = [ 'VectorExtension', diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 6c14c3d..9df4fdb 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -1,5 +1,5 @@ from django.db.models import FloatField, Func, Value -from ..utils import Vector, HalfVector, SparseVector +from .. import Vector, HalfVector, SparseVector class DistanceBase(Func): diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 6b59a7f..3aeb90f 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -1,6 +1,6 @@ from django import forms from django.db.models import Field -from ..utils import HalfVector +from .. import HalfVector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index d0d2d07..580f27c 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -1,6 +1,6 @@ from django import forms from django.db.models import Field -from ..utils import SparseVector +from .. import SparseVector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py index a89d540..861cfde 100644 --- a/pgvector/django/vector.py +++ b/pgvector/django/vector.py @@ -1,7 +1,7 @@ from django import forms from django.db.models import Field import numpy as np -from ..utils import Vector +from .. import Vector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 945e0dc..df21200 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -2,7 +2,9 @@ from .halfvec import HalfVectorField from .sparsevec import SparseVectorField from .vector import VectorField -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. import HalfVector, SparseVector __all__ = [ 'VectorField', diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index deaa14d..0901fd2 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import HalfVector +from .. import HalfVector class HalfVectorField(Field): diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index 67f7d1b..86dea73 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import SparseVector +from .. 
import SparseVector class SparseVectorField(Field): diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py index 22a87e5..83f9997 100644 --- a/pgvector/peewee/vector.py +++ b/pgvector/peewee/vector.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import Vector +from .. import Vector class VectorField(Field): diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 9007c37..980af84 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector, register_vector_async -from ..utils import Bit, HalfVector, SparseVector, Vector + +# TODO remove +from .. import Bit, HalfVector, SparseVector, Vector __all__ = [ 'register_vector', diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py index f8eeb61..cffe8fb 100644 --- a/pgvector/psycopg/bit.py +++ b/pgvector/psycopg/bit.py @@ -1,6 +1,6 @@ from psycopg.adapt import Dumper from psycopg.pq import Format -from ..utils import Bit +from .. import Bit class BitDumper(Dumper): diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py index 351d2cb..b3a0060 100644 --- a/pgvector/psycopg/halfvec.py +++ b/pgvector/psycopg/halfvec.py @@ -1,6 +1,6 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import HalfVector +from .. import HalfVector class HalfVectorDumper(Dumper): diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py index 435fd06..384a0e1 100644 --- a/pgvector/psycopg/sparsevec.py +++ b/pgvector/psycopg/sparsevec.py @@ -1,6 +1,6 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import SparseVector +from .. import SparseVector class SparseVectorDumper(Dumper): diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index 0f62ca9..db9e826 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -1,7 +1,7 @@ import psycopg from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import Vector +from .. import Vector class VectorDumper(Dumper): diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index f109203..b40c673 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector -from ..utils import HalfVector, SparseVector, Vector + +# TODO remove +from .. import HalfVector, SparseVector, Vector __all__ = [ 'register_vector', diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index b50e89b..0a4c736 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import HalfVector +from .. import HalfVector class HalfvecAdapter: diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index a542807..148eff2 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import SparseVector +from .. 
import SparseVector class SparsevecAdapter: diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 5bd00bb..562de18 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -1,6 +1,6 @@ import numpy as np from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import Vector +from .. import Vector class VectorAdapter: diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 4955eeb..52adf88 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -4,7 +4,9 @@ from .sparsevec import SPARSEVEC from .vector import VECTOR from .vector import VECTOR as Vector -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. import HalfVector, SparseVector __all__ = [ 'Vector', diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py index 639f77b..10688b5 100644 --- a/pgvector/sqlalchemy/halfvec.py +++ b/pgvector/sqlalchemy/halfvec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import HalfVector +from .. import HalfVector class HALFVEC(UserDefinedType): diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py index 370f5d1..0058679 100644 --- a/pgvector/sqlalchemy/sparsevec.py +++ b/pgvector/sqlalchemy/sparsevec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import SparseVector +from .. import SparseVector class SPARSEVEC(UserDefinedType): diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py index f57a045..5a1e11f 100644 --- a/pgvector/sqlalchemy/vector.py +++ b/pgvector/sqlalchemy/vector.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import Vector +from .. import Vector class VECTOR(UserDefinedType): From 08e29e1acdcdf03965f7ffb4e1e552688df51785 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:04:40 -0800 Subject: [PATCH 082/123] Added todo [skip ci] --- pgvector/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 1dcc240..8cdb5d6 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,3 +1,4 @@ +# TODO remove from .. 
import Bit, HalfVector, SparseVector, Vector __all__ = [ From 1c0ff62b65718899915cd51466c63b9b60c3787f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:50:36 -0800 Subject: [PATCH 083/123] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1788ff4..df60740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.4.0 (unreleased) +- Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Dropped support for Python < 3.9 ## 0.3.6 (2024-10-26) From f618edb43395795451d3079b1def7f6c8cbb76ba Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:55:07 -0800 Subject: [PATCH 084/123] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index df60740..24aebf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes +- Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 ## 0.3.6 (2024-10-26) From 537f3ba72519314b2bc5b65f7d625594a496345e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 22:01:16 -0800 Subject: [PATCH 085/123] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24aebf9..42e9bff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.4.0 (unreleased) +- Added top-level `pgvector` package - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 From 571bf4287cfe8a2371f477250c02efa3f62f67a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 22:03:52 -0800 Subject: [PATCH 086/123] Updated changelog [skip ci] --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42e9bff..f53a2ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## 0.4.0 (unreleased) - Added top-level `pgvector` package +- Changed `globally` option to default to `False` for Psycopg 2 +- Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 From 1676e3ead391493375ff6958a5b80c78080cf01e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 17:13:55 -0800 Subject: [PATCH 087/123] Test SQLAlchemy 1 on CI --- .github/workflows/build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc53dfe..4d4e8ed 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,3 +24,6 @@ jobs: make sudo make install - run: pytest + + - run: pip install "SQLAlchemy<2" -U + - run: pytest tests/test_sqlalchemy.py From ac9fd532f77c1497df250e519238f7f5d627f645 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 17:49:10 -0800 Subject: [PATCH 088/123] Improved Bit constructor for uint8 NumPy arrays --- pgvector/bit.py | 11 ++++------- tests/test_bit.py | 4 +--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 227edc1..36da723 100644 --- 
a/pgvector/bit.py +++ b/pgvector/bit.py @@ -7,14 +7,11 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: - # TODO change in 0.4.0 # TODO raise if dtype not bool or uint8 - # if isinstance(value, np.ndarray) and value.dtype == np.uint8: - # value = np.unpackbits(value) - # else: - # value = np.asarray(value, dtype=bool) - - value = np.asarray(value, dtype=bool) + if isinstance(value, np.ndarray) and value.dtype == np.uint8: + value = np.unpackbits(value) + else: + value = np.asarray(value, dtype=bool) if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index e0dcfe6..1d771ca 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -15,9 +15,7 @@ def test_str(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - # TODO change in 0.4.0 - # assert Bit(arr).to_text() == '111111100000011100000000' - assert Bit(arr).to_text() == '110' + assert Bit(arr).to_text() == '111111100000011100000000' def test_ndarray_same_object(self): arr = np.array([True, False, True]) From 1b25460e6184bb744b9c71c9c5b95852bdf7c63f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:02:23 -0800 Subject: [PATCH 089/123] Raise error for unexpected dtype for Bit constructor [skip ci] --- pgvector/bit.py | 8 +++++--- tests/test_bit.py | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 36da723..4be7385 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -7,9 +7,11 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: - # TODO raise if dtype not bool or uint8 - if isinstance(value, np.ndarray) and value.dtype == np.uint8: - value = np.unpackbits(value) + if isinstance(value, np.ndarray): + if value.dtype == np.uint8: + value = np.unpackbits(value).astype(bool) + elif value.dtype != np.bool: + raise ValueError('expected dtype to be bool or uint8') else: value = np.asarray(value, dtype=bool) diff --git a/tests/test_bit.py b/tests/test_bit.py index 1d771ca..5e1bff2 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -17,6 +17,12 @@ def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) assert Bit(arr).to_text() == '111111100000011100000000' + def test_ndarray_uint16(self): + arr = np.array([254, 7, 0], dtype=np.uint16) + with pytest.raises(ValueError) as error: + Bit(arr) + assert str(error.value) == 'expected dtype to be bool or uint8' + def test_ndarray_same_object(self): arr = np.array([True, False, True]) assert Bit(arr).to_list() == [True, False, True] From 8b927161f7856415436159d0b2c804280261a759 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:11:57 -0800 Subject: [PATCH 090/123] Improved asyncpg tests [skip ci] --- tests/test_asyncpg.py | 45 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 3c36048..982ea8d 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,6 +1,6 @@ import asyncpg import numpy as np -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector, Vector from pgvector.asyncpg import register_vector import pytest @@ -15,13 +15,15 @@ async def test_vector(self): await register_vector(conn) - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 
3]) + embedding2 = np.array([4.5, 5, 6]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embedding'], embedding) + assert np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None # ensures binary format is correct text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") @@ -38,12 +40,14 @@ async def test_halfvec(self): await register_vector(conn) - embedding = [1.5, 2, 3] - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = HalfVector([1.5, 2, 3]) + embedding2 = [4.5, 5, 6] + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['embedding'].to_list() == [1.5, 2, 3] - assert res[1]['embedding'] is None + assert res[0]['embedding'] == embedding + assert res[1]['embedding'] == HalfVector(embedding2) + assert res[2]['embedding'] is None # ensures binary format is correct text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") @@ -87,7 +91,7 @@ async def test_sparsevec(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['embedding'].to_list() == [1.5, 2, 3] + assert res[0]['embedding'] == embedding assert res[1]['embedding'] is None # ensures binary format is correct @@ -105,12 +109,15 @@ async def test_vector_array(self): await register_vector(conn) - embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] - await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings[0], embeddings[1]) + embeddings = [Vector([1.5, 2, 3]), Vector([4.5, 5, 6])] + embeddings2 = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[]), (ARRAY[$3, $4]::vector[])", embeddings[0], embeddings[1], embeddings2[0], embeddings2[1]) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embeddings'][0], embeddings[0]) - assert np.array_equal(res[0]['embeddings'][1], embeddings[1]) + assert np.array_equal(res[0]['embeddings'][0], embeddings[0].to_numpy()) + assert np.array_equal(res[0]['embeddings'][1], embeddings[1].to_numpy()) + assert np.array_equal(res[1]['embeddings'][0], embeddings2[0]) + assert np.array_equal(res[1]['embeddings'][1], embeddings2[1]) await conn.close() @@ -126,10 +133,12 @@ async def init(conn): await conn.execute('DROP TABLE IF EXISTS asyncpg_items') await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 3]) + embedding2 = np.array([1.5, 2, 3]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embedding'], embedding) + assert 
np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None From 9f825f2e8360a4f6ec8af0341584817e5191008c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:14:09 -0800 Subject: [PATCH 091/123] Improved asyncpg test [skip ci] --- tests/test_asyncpg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 982ea8d..34d66a1 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -110,8 +110,10 @@ async def test_vector_array(self): await register_vector(conn) embeddings = [Vector([1.5, 2, 3]), Vector([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES ($1)", embeddings) + embeddings2 = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] - await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[]), (ARRAY[$3, $4]::vector[])", embeddings[0], embeddings[1], embeddings2[0], embeddings2[1]) + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings2[0], embeddings2[1]) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert np.array_equal(res[0]['embeddings'][0], embeddings[0].to_numpy()) From bb3b32ccf9718c3675767de3e226d3638c1f82ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:20:20 -0800 Subject: [PATCH 092/123] Improved tests [skip ci] --- tests/test_peewee.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 670d880..d7028c3 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField db = PostgresqlDatabase('pgvector_python_test') @@ -77,7 +77,7 @@ def test_vector_l1_distance(self): def test_halfvec(self): Item.create(id=1, half_embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -129,7 +129,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item.create(id=1, sparse_embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -186,7 +186,7 @@ def test_halfvec_avg(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() - assert avg.to_list() == [2.5, 3.5, 4.5] + assert avg == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() @@ -194,7 +194,7 @@ def test_halfvec_sum(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() - assert sum.to_list() == [5, 7, 9] + assert sum == HalfVector([5, 7, 9]) def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) From c7cd058ea3145fd7cdcb45f712c0f4450ddbe16e Mon Sep 17 00:00:00 2001 From: 
Andrew Kane Date: Mon, 10 Feb 2025 18:24:06 -0800 Subject: [PATCH 093/123] Improved tests [skip ci] --- tests/test_django.py | 12 ++++++------ tests/test_psycopg.py | 8 ++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 65082a3..f187ad4 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -199,7 +199,7 @@ def test_vector_l1_distance(self): def test_halfvec(self): Item(id=1, half_embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -251,7 +251,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item(id=1, sparse_embedding=SparseVector([1, 2, 3])).save() item = Item.objects.get(pk=1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -309,7 +309,7 @@ def test_halfvec_avg(self): Item(half_embedding=[1, 2, 3]).save() Item(half_embedding=[4, 5, 6]).save() avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] - assert avg.to_list() == [2.5, 3.5, 4.5] + assert avg == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] @@ -317,7 +317,7 @@ def test_halfvec_sum(self): Item(half_embedding=[1, 2, 3]).save() Item(half_embedding=[4, 5, 6]).save() sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] - assert sum.to_list() == [5, 7, 9] + assert sum == HalfVector([5, 7, 9]) def test_serialization(self): create_items() @@ -375,7 +375,7 @@ def test_halfvec_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).half_embedding.to_list() + assert Item.objects.get(pk=1).half_embedding == HalfVector([4, 5, 6]) def test_halfvec_form_save_missing(self): Item(id=1).save() @@ -432,7 +432,7 @@ def test_sparsevec_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).sparse_embedding.to_list() + assert Item.objects.get(pk=1).sparse_embedding == SparseVector([4, 5, 6]) def test_sparesevec_form_save_missing(self): Item(id=1).save() diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 6a9d0b7..e2a40b2 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -69,17 +69,19 @@ def test_halfvec(self): conn.execute('INSERT INTO psycopg_items (half_embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT half_embedding FROM psycopg_items ORDER BY id').fetchone()[0] - assert res.to_list() == [1.5, 2, 3] + assert res == HalfVector([1.5, 2, 3]) def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -106,11 +108,12 @@ def test_sparsevec(self): conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) res = 
conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] - assert res.to_list() == [1.5, 2, 3] + assert res == SparseVector([1.5, 2, 3]) def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res == embedding assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -120,6 +123,7 @@ def test_sparsevec_binary_format(self): def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res == embedding assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] From 8441b463ccc8738a951dd8fd2c9ac8b8b292c774 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:26:05 -0800 Subject: [PATCH 094/123] Improved tests [skip ci] --- tests/test_psycopg2.py | 12 ++++++------ tests/test_sparse_vector.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 1994c87..71e0015 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -46,7 +46,7 @@ def test_halfvec(self): cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][0] == HalfVector([1.5, 2, 3]) assert res[1][0] is None def test_bit(self): @@ -64,7 +64,7 @@ def test_sparsevec(self): cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][0] == SparseVector([1.5, 2, 3]) assert res[1][0] is None def test_vector_array(self): @@ -82,8 +82,8 @@ def test_halfvec_array(self): cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0].to_list() == [1.5, 2, 3] - assert res[0][1].to_list() == [4.5, 5, 6] + assert res[0][0] == HalfVector([1.5, 2, 3]) + assert res[0][1] == HalfVector([4.5, 5, 6]) def test_sparsevec_array(self): embeddings = [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] @@ -91,8 +91,8 @@ def test_sparsevec_array(self): cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0].to_list() == [1.5, 2, 3] - assert res[0][1].to_list() == [4.5, 5, 6] + assert res[0][0] == SparseVector([1.5, 2, 3]) + assert res[0][1] == SparseVector([4.5, 5, 6]) def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index b5e7fe8..29c3ea7 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -56,6 +56,7 @@ def test_equality(self): assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector([1, 0, 2, 0, 3, 0]) assert SparseVector([1, 0, 2, 0, 3, 0]) != SparseVector([1, 0, 2, 0, 3, 1]) assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + assert SparseVector({}, 1) != SparseVector({}, 2) def test_dimensions(self): assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 From 6b8857a3146cf581bebcf32eb81a37135aa2fc15 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:28:20 -0800 Subject: [PATCH 095/123] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_sqlalchemy.py 
b/tests/test_sqlalchemy.py index 052edd7..d791bd6 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import asyncpg import numpy as np import os -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -256,7 +256,7 @@ def test_halfvec(self, engine): session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self, engine): create_items() @@ -348,7 +348,7 @@ def test_sparsevec(self, engine): session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self, engine): create_items() @@ -551,8 +551,8 @@ def test_halfvec_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + assert item.half_embeddings[0] == HalfVector([1, 2, 3]) + assert item.half_embeddings[1] == HalfVector([4, 5, 6]) @pytest.mark.parametrize('engine', async_engines) @@ -582,7 +582,7 @@ async def test_halfvec(self, engine): embedding = [1, 2, 3] session.add(Item(id=1, half_embedding=embedding)) item = await session.get(Item, 1) - assert item.half_embedding.to_list() == embedding + assert item.half_embedding == HalfVector(embedding) await engine.dispose() @@ -608,7 +608,7 @@ async def test_sparsevec(self, engine): embedding = [1, 2, 3] session.add(Item(id=1, sparse_embedding=embedding)) item = await session.get(Item, 1) - assert item.sparse_embedding.to_list() == embedding + assert item.sparse_embedding == SparseVector(embedding) await engine.dispose() From 022dd061b4ebdfb7e39b23abe782bc2d89ec9e98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:29:57 -0800 Subject: [PATCH 096/123] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index d791bd6..4b1e516 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import asyncpg import numpy as np import os -from pgvector import HalfVector, SparseVector +from pgvector import HalfVector, SparseVector, Vector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -637,9 +637,14 @@ async def test_vector_array(self, engine): async with async_session() as session: async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.add(Item(id=1, embeddings=[Vector([1, 2, 3]), Vector([4, 5, 6])])) item = await session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] + session.add(Item(id=2, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + item = await session.get(Item, 2) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + 
await engine.dispose() From bb02ee2742714cb4b566b95deb71a82539977dd4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:48:17 -0800 Subject: [PATCH 097/123] Improved tests [skip ci] --- tests/test_sqlmodel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index b0e8ccd..8a472b1 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy.exc import StatementError @@ -107,7 +107,7 @@ def test_halfvec(self): session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -157,7 +157,7 @@ def test_sparsevec(self): session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -220,7 +220,7 @@ def test_halfvec_avg(self): session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) res = session.exec(select(avg(Item.half_embedding))).first() - assert res.to_list() == [2.5, 3.5, 4.5] + assert res == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): with Session(engine) as session: @@ -229,7 +229,7 @@ def test_halfvec_sum(self): session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) res = session.exec(select(sum(Item.half_embedding))).first() - assert res.to_list() == [5, 7, 9] + assert res == HalfVector([5, 7, 9]) def test_bad_dimensions(self): item = Item(embedding=[1, 2]) From 340caa58195fc5e7b99eed8ab7fe4e4e912fd73c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:53:17 -0800 Subject: [PATCH 098/123] Improved tests [skip ci] --- tests/test_psycopg.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index e2a40b2..0859be7 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -75,6 +75,7 @@ def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) + # TODO move assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -82,6 +83,7 @@ def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) + # TODO move assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -114,6 +116,7 @@ def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res == embedding + # TODO move assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -124,6 +127,7 @@ def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res == embedding + # TODO move 
assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -166,8 +170,8 @@ def test_binary_copy_to(self): cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: for row in copy.rows(): - assert Vector.from_binary(row[0]).to_list() == [1.5, 2, 3] - assert HalfVector.from_binary(row[1]).to_list() == [1.5, 2, 3] + assert np.array_equal(Vector.from_binary(row[0]).to_numpy(), embedding) + assert HalfVector.from_binary(row[1]) == half_embedding def test_binary_copy_to_set_types(self): embedding = np.array([1.5, 2, 3]) @@ -178,7 +182,7 @@ def test_binary_copy_to_set_types(self): copy.set_types(['vector', 'halfvec']) for row in copy.rows(): assert np.array_equal(row[0], embedding) - assert row[1].to_list() == [1.5, 2, 3] + assert row[1] == half_embedding def test_vector_array(self): embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] From e6edb2a68f1a93df94c725c6d5ba29654694feab Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:56:52 -0800 Subject: [PATCH 099/123] Improved tests [skip ci] --- tests/test_half_vector.py | 5 +++++ tests/test_psycopg.py | 9 --------- tests/test_sparse_vector.py | 8 ++++++++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 6a94c2e..a17699a 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -44,3 +44,8 @@ def test_equality(self): def test_dimensions(self): assert HalfVector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = HalfVector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 0859be7..24ab321 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -83,9 +83,6 @@ def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) - # TODO move - assert res.to_list() == [1.5, 2, 3] - assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_bit(self): embedding = Bit([True, False, True]) @@ -127,12 +124,6 @@ def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res == embedding - # TODO move - assert res.dimensions() == 6 - assert res.indices() == [0, 2, 4] - assert res.values() == [1.5, 2, 3] - assert res.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_text_copy_from(self): embedding = np.array([1.5, 2, 3]) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 29c3ea7..fb01b5e 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -73,3 +73,11 @@ def test_to_coo(self): def test_zero_vector_text(self): vec = SparseVector({}, 3) assert vec.to_list() == SparseVector.from_text(vec.to_text()).to_list() + + def test_from_text(self): + vec = SparseVector.from_text('{1:1.5,3:2,5:3}/6') + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From b57a2e9ed35c68eaa22afe27ce93401b0190adc2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:06:11 -0800 Subject: 
[PATCH 100/123] Improved tests [skip ci] --- tests/test_half_vector.py | 8 ++++++++ tests/test_psycopg.py | 9 --------- tests/test_sparse_vector.py | 11 +++++++++++ tests/test_vector.py | 13 +++++++++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index a17699a..9c0b041 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -1,6 +1,7 @@ import numpy as np from pgvector import HalfVector import pytest +from struct import pack class TestHalfVector: @@ -49,3 +50,10 @@ def test_from_text(self): vec = HalfVector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + + def test_from_binary(self): + data = pack('>HH3e', 3, 0, *[1.5, 2, 3]) + vec = HalfVector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert vec.to_binary() == data diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 24ab321..f61b4e3 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -75,9 +75,6 @@ def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) - # TODO move - assert res.to_list() == [1.5, 2, 3] - assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) @@ -113,12 +110,6 @@ def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res == embedding - # TODO move - assert res.dimensions() == 6 - assert res.indices() == [0, 2, 4] - assert res.values() == [1.5, 2, 3] - assert res.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index fb01b5e..fb51db9 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -2,6 +2,7 @@ from pgvector import SparseVector import pytest from scipy.sparse import coo_array +from struct import pack class TestSparseVector: @@ -81,3 +82,13 @@ def test_from_text(self): assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + + def test_from_binary(self): + data = pack('>iii3i3f', 6, 3, 0, *[0, 2, 4], *[1.5, 2, 3]) + vec = SparseVector.from_binary(data) + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert vec.to_binary() == data diff --git a/tests/test_vector.py b/tests/test_vector.py index 406637f..094dd34 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -1,6 +1,7 @@ import numpy as np from pgvector import Vector import pytest +from struct import pack class TestVector: @@ -44,3 +45,15 @@ def test_equality(self): def test_dimensions(self): assert Vector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = Vector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + + def test_from_binary(self): + data = pack('>HH3f', 
3, 0, *[1.5, 2, 3]) + vec = Vector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert vec.to_binary() == data From b6ccb3043fb1dac552b4dcdf6ecb947434d3b234 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:06:51 -0800 Subject: [PATCH 101/123] Improved tests [skip ci] --- tests/test_half_vector.py | 2 +- tests/test_sparse_vector.py | 2 +- tests/test_vector.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 9c0b041..756adc2 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -52,7 +52,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) def test_from_binary(self): - data = pack('>HH3e', 3, 0, *[1.5, 2, 3]) + data = pack('>HH3e', 3, 0, 1.5, 2, 3) vec = HalfVector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index fb51db9..cf5b016 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -84,7 +84,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_from_binary(self): - data = pack('>iii3i3f', 6, 3, 0, *[0, 2, 4], *[1.5, 2, 3]) + data = pack('>iii3i3f', 6, 3, 0, 0, 2, 4, 1.5, 2, 3) vec = SparseVector.from_binary(data) assert vec.dimensions() == 6 assert vec.indices() == [0, 2, 4] diff --git a/tests/test_vector.py b/tests/test_vector.py index 094dd34..c367a7a 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -52,7 +52,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) def test_from_binary(self): - data = pack('>HH3f', 3, 0, *[1.5, 2, 3]) + data = pack('>HH3f', 3, 0, 1.5, 2, 3) vec = Vector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) From e566d4c9b4968b232c2348e9e608d06ee90b6253 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:20:13 -0800 Subject: [PATCH 102/123] Improved tests [skip ci] --- tests/test_django.py | 12 ++++++------ tests/test_half_vector.py | 4 ++-- tests/test_peewee.py | 10 +++++----- tests/test_psycopg.py | 12 ++++++------ tests/test_psycopg2.py | 2 +- tests/test_sparse_vector.py | 8 ++++---- tests/test_sqlalchemy.py | 18 +++++++++--------- tests/test_sqlmodel.py | 2 +- tests/test_vector.py | 4 ++-- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index f187ad4..7a8a6eb 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -165,7 +165,7 @@ def setup_method(self): def test_vector(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 def test_vector_l2_distance(self): @@ -293,7 +293,7 @@ def test_vector_avg(self): Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + assert np.array_equal(avg, [2.5, 3.5, 4.5]) def test_vector_sum(self): sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] @@ -301,7 +301,7 @@ def test_vector_sum(self): Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() sum = 
Item.objects.aggregate(Sum('embedding'))['embedding__sum'] - assert np.array_equal(sum, np.array([5, 7, 9])) + assert np.array_equal(sum, [5, 7, 9]) def test_halfvec_avg(self): avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] @@ -347,7 +347,7 @@ def test_vector_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).embedding.tolist() + assert np.array_equal(Item.objects.get(pk=1).embedding, [4, 5, 6]) def test_vector_form_save_missing(self): Item(id=1).save() @@ -465,8 +465,8 @@ def test_vector_array(self): # this fails if the driver does not cast arrays item = Item.objects.get(pk=1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) def test_double_array(self): Item(id=1, double_embedding=[1, 1, 1]).save() diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 756adc2..78b4977 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -49,11 +49,11 @@ def test_dimensions(self): def test_from_text(self): vec = HalfVector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) def test_from_binary(self): data = pack('>HH3e', 3, 0, 1.5, 2, 3) vec = HalfVector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) assert vec.to_binary() == data diff --git a/tests/test_peewee.py b/tests/test_peewee.py index d7028c3..64fc009 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -43,7 +43,7 @@ def setup_method(self): def test_vector(self): Item.create(id=1, embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 def test_vector_l2_distance(self): @@ -170,7 +170,7 @@ def test_vector_avg(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + assert np.array_equal(avg, [2.5, 3.5, 4.5]) def test_vector_sum(self): sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() @@ -178,7 +178,7 @@ def test_vector_sum(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() - assert np.array_equal(sum, np.array([5, 7, 9])) + assert np.array_equal(sum, [5, 7, 9]) def test_halfvec_avg(self): avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() @@ -220,5 +220,5 @@ class Meta: # fails with column "embeddings" is of type vector[] but expression is of type text[] # ExtItem.create(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]) # item = ExtItem.get_by_id(1) - # assert np.array_equal(item.embeddings[0], np.array([1, 2, 3])) - # assert np.array_equal(item.embeddings[1], np.array([4, 5, 6])) + # assert np.array_equal(item.embeddings[0], [1, 2, 3]) + # assert np.array_equal(item.embeddings[1], [4, 5, 6]) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index f61b4e3..698b34f 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -46,23 +46,23 @@ def 
test_vector_text_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) def test_vector_binary_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) def test_vector_class_binary_format(self): embedding = Vector([1.5, 2, 3]) res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] - assert np.array_equal(res, np.array([1.5, 2, 3])) + assert np.array_equal(res, [1.5, 2, 3]) def test_vector_class_text_format(self): embedding = Vector([1.5, 2, 3]) res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([1.5, 2, 3])) + assert np.array_equal(res, [1.5, 2, 3]) def test_halfvec(self): embedding = HalfVector([1.5, 2, 3]) @@ -182,7 +182,7 @@ def configure(conn): with pool.connection() as conn: res = conn.execute("SELECT '[1,2,3]'::vector").fetchone() - assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) pool.close() @@ -218,6 +218,6 @@ async def configure(conn): async with conn.cursor() as cur: await cur.execute("SELECT '[1,2,3]'::vector") res = await cur.fetchone() - assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) await pool.close() diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 71e0015..8f56ef5 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -122,7 +122,7 @@ def test_pool(self): cur = conn.cursor() cur.execute("SELECT '[1,2,3]'::vector") res = cur.fetchone() - assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) finally: pool.putconn(conn) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index cf5b016..dff03dd 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -9,7 +9,7 @@ class TestSparseVector: def test_list(self): vec = SparseVector([1, 0, 2, 0, 3, 0]) assert vec.to_list() == [1, 0, 2, 0, 3, 0] - assert vec.to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), [1, 0, 2, 0, 3, 0]) assert vec.indices() == [0, 2, 4] def test_list_dimensions(self): @@ -69,7 +69,7 @@ def test_values(self): assert SparseVector([1, 0, 2, 0, 3, 0]).values() == [1, 2, 3] def test_to_coo(self): - assert SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] + assert np.array_equal(SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray(), [[1, 0, 2, 0, 3, 0]]) def test_zero_vector_text(self): vec = SparseVector({}, 3) @@ -81,7 +81,7 @@ def test_from_text(self): assert vec.indices() == [0, 2, 4] assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert np.array_equal(vec.to_numpy(), [1.5, 0, 2, 0, 3, 0]) def test_from_binary(self): data = pack('>iii3i3f', 6, 3, 0, 0, 2, 4, 1.5, 2, 3) @@ -90,5 +90,5 @@ def test_from_binary(self): assert vec.indices() == [0, 2, 4] assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert np.array_equal(vec.to_numpy(), 
[1.5, 0, 2, 0, 3, 0]) assert vec.to_binary() == data diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 4b1e516..41c309f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -201,7 +201,7 @@ def test_vector(self, engine): session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, [1, 2, 3]) def test_vector_l2_distance(self, engine): create_items() @@ -509,7 +509,7 @@ def test_automap(self, engine): with Session(engine) as session: session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) item = session.query(AutoItem).first() - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, [1, 2, 3]) def test_half_precision(self, engine): create_items() @@ -541,8 +541,8 @@ def test_vector_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) def test_halfvec_array(self, engine): with Session(engine) as session: @@ -621,7 +621,7 @@ async def test_avg(self, engine): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) res = await session.scalars(select(avg(Item.embedding))) - assert res.first().tolist() == [2.5, 3.5, 4.5] + assert np.array_equal(res.first(), [2.5, 3.5, 4.5]) await engine.dispose() @@ -639,12 +639,12 @@ async def test_vector_array(self, engine): async with session.begin(): session.add(Item(id=1, embeddings=[Vector([1, 2, 3]), Vector([4, 5, 6])])) item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) session.add(Item(id=2, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) item = await session.get(Item, 2) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) await engine.dispose() diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 8a472b1..f4994f4 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -76,7 +76,7 @@ def test_vector(self): session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, np.array([1, 2, 3])) def test_vector_l2_distance(self): create_items() diff --git a/tests/test_vector.py b/tests/test_vector.py index c367a7a..e5a16fe 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -49,11 +49,11 @@ def test_dimensions(self): def test_from_text(self): vec = Vector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) def test_from_binary(self): data = pack('>HH3f', 3, 0, 1.5, 2, 3) vec = Vector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) assert vec.to_binary() == data From 057eff226bdb992ebdd952628bf3d54996d9437d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: 
Mon, 10 Feb 2025 19:38:31 -0800 Subject: [PATCH 103/123] Improved tests [skip ci] --- tests/test_psycopg2.py | 6 ++---- tests/test_sqlalchemy.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 8f56ef5..3e5c8c3 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -82,8 +82,7 @@ def test_halfvec_array(self): cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0] == HalfVector([1.5, 2, 3]) - assert res[0][1] == HalfVector([4.5, 5, 6]) + assert res[0] == [HalfVector([1.5, 2, 3]), HalfVector([4.5, 5, 6])] def test_sparsevec_array(self): embeddings = [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] @@ -91,8 +90,7 @@ def test_sparsevec_array(self): cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0] == SparseVector([1.5, 2, 3]) - assert res[0][1] == SparseVector([4.5, 5, 6]) + assert res[0] == [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 41c309f..0d8d1ca 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -551,8 +551,7 @@ def test_halfvec_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.half_embeddings[0] == HalfVector([1, 2, 3]) - assert item.half_embeddings[1] == HalfVector([4, 5, 6]) + assert item.half_embeddings == [HalfVector([1, 2, 3]), HalfVector([4, 5, 6])] @pytest.mark.parametrize('engine', async_engines) From 8443ff519ac39a9f0b9b2c7233b33accbe6f63ae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 20:05:01 -0800 Subject: [PATCH 104/123] Added missing dependency for example [skip ci] --- examples/implicit/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/implicit/requirements.txt b/examples/implicit/requirements.txt index 8f04b58..424abbd 100644 --- a/examples/implicit/requirements.txt +++ b/examples/implicit/requirements.txt @@ -1,3 +1,4 @@ +h5py implicit pgvector psycopg[binary] From 2496340bc5e91a0b5cad2462f276c7b488f2e36a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 16:45:35 -0800 Subject: [PATCH 105/123] Added support for pg8000 --- CHANGELOG.md | 1 + README.md | 48 ++++++++++++++++++++++++++++- pgvector/pg8000/__init__.py | 5 ++++ pgvector/pg8000/register.py | 23 ++++++++++++++ tests/test_pg8000.py | 60 +++++++++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 pgvector/pg8000/__init__.py create mode 100644 pgvector/pg8000/register.py create mode 100644 tests/test_pg8000.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f53a2ce..ebc165a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Added top-level `pgvector` package +- Added support for pg8000 - Changed `globally` option to default to `False` for Psycopg 2 - Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes diff --git a/README.md b/README.md index 5a59c9d..7f980bd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [pgvector](https://github.com/pgvector/pgvector) support for Python -Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), 
[SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), and [Peewee](https://github.com/coleifer/peewee) +Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), [pg8000](https://github.com/tlocke/pg8000), and [Peewee](https://github.com/coleifer/peewee) [![Build Status](https://github.com/pgvector/pgvector-python/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-python/actions) @@ -22,6 +22,7 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) +- [pg8000](#pg8000) [unreleased] - [Peewee](#peewee) Or check out some examples: @@ -562,6 +563,51 @@ await conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +## pg8000 + +Enable the extension + +```python +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +``` + +Register the vector type with your connection + +```python +from pgvector.pg8000 import register_vector + +register_vector(conn) +``` + +Create a table + +```python +conn.run('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + +Insert a vector + +```python +embedding = np.array([1, 2, 3]) +conn.run('INSERT INTO items (embedding) VALUES (:embedding)', embedding=embedding) +``` + +Get the nearest neighbors to a vector + +```python +conn.run('SELECT * FROM items ORDER BY embedding <-> :embedding LIMIT 5', embedding=embedding) +``` + +Add an approximate index + +```python +conn.run('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +conn.run('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Peewee Add a vector column diff --git a/pgvector/pg8000/__init__.py b/pgvector/pg8000/__init__.py new file mode 100644 index 0000000..b3b4440 --- /dev/null +++ b/pgvector/pg8000/__init__.py @@ -0,0 +1,5 @@ +from .register import register_vector + +__all__ = [ + 'register_vector' +] diff --git a/pgvector/pg8000/register.py b/pgvector/pg8000/register.py new file mode 100644 index 0000000..15ee219 --- /dev/null +++ b/pgvector/pg8000/register.py @@ -0,0 +1,23 @@ +import numpy as np +from .. 
import Vector, HalfVector, SparseVector + + +def register_vector(conn): + # use to_regtype to get first matching type in search path + res = conn.run("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") + type_info = dict(res) + + if 'vector' not in type_info: + raise RuntimeError('vector type not found in the database') + + conn.register_out_adapter(Vector, Vector._to_db) + conn.register_out_adapter(np.ndarray, Vector._to_db) + conn.register_in_adapter(type_info['vector'], Vector._from_db) + + if 'halfvec' in type_info: + conn.register_out_adapter(HalfVector, HalfVector._to_db) + conn.register_in_adapter(type_info['halfvec'], HalfVector._from_db) + + if 'sparsevec' in type_info: + conn.register_out_adapter(SparseVector, SparseVector._to_db) + conn.register_in_adapter(type_info['sparsevec'], SparseVector._from_db) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py new file mode 100644 index 0000000..86c0fb1 --- /dev/null +++ b/tests/test_pg8000.py @@ -0,0 +1,60 @@ +import numpy as np +import os +from pgvector import HalfVector, SparseVector, Vector +from pgvector.pg8000 import register_vector +from pg8000.native import Connection + +conn = Connection(os.environ["USER"], database='pgvector_python_test') + +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +conn.run('DROP TABLE IF EXISTS pg8000_items') +conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') + +register_vector(conn) + + +class TestPg8000: + def setup_method(self): + conn.run('DELETE FROM pg8000_items') + + def test_vector(self): + embedding = np.array([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_halfvec(self): + embedding = HalfVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (half_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT half_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None + + def test_bit(self): + embedding = '101' + conn.run('INSERT INTO pg8000_items (binary_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT binary_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == '101' + assert res[1][0] is None + + def test_sparsevec(self): + embedding = SparseVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (sparse_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT sparse_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None From df1766b7f9ed6320958c04caf7f1b832d5320e4b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 16:59:42 -0800 Subject: [PATCH 106/123] Simplified test code [skip ci] --- 
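Beyond the README's `vector` example, the adapters registered above also cover `halfvec` and `sparsevec` when those types exist in the database. A minimal round-trip sketch, mirroring the test setup (current OS user, local `pgvector_python_test` database) rather than a definitive API:

```python
# Sketch only: assumes a local pgvector_python_test database, as in the tests
import os
from pg8000.native import Connection
from pgvector import HalfVector, SparseVector
from pgvector.pg8000 import register_vector

conn = Connection(os.environ['USER'], database='pgvector_python_test')
conn.run('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)

# parameters go out through the text adapters; results come back as pgvector classes
row = conn.run(
    'SELECT cast(:h as halfvec), cast(:s as sparsevec)',
    h=HalfVector([1.5, 2, 3]),
    s=SparseVector([1.5, 0, 2, 0, 3, 0]),
)[0]
assert row[0] == HalfVector([1.5, 2, 3])
assert row[1] == SparseVector([1.5, 0, 2, 0, 3, 0])
```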
tests/test_pg8000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py index 86c0fb1..4d3e474 100644 --- a/tests/test_pg8000.py +++ b/tests/test_pg8000.py @@ -8,7 +8,7 @@ conn.run('CREATE EXTENSION IF NOT EXISTS vector') conn.run('DROP TABLE IF EXISTS pg8000_items') -conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') +conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') register_vector(conn) From 70ff5d4765bb156a45d806d3cd171b3a38f03fca Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 17:00:58 -0800 Subject: [PATCH 107/123] Improved tests [skip ci] --- tests/test_psycopg2.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3e5c8c3..7f4932d 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -49,6 +49,15 @@ def test_halfvec(self): assert res[0][0] == HalfVector([1.5, 2, 3]) assert res[1][0] is None + def test_halfvec_class(self): + embedding = HalfVector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == embedding + assert res[1][0] is None + def test_bit(self): embedding = '101' cur.execute('INSERT INTO psycopg2_items (binary_embedding) VALUES (%s), (NULL)', (embedding,)) From ac9e398f511ca65f11f62f4296e94f2106367936 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 13:15:19 -0800 Subject: [PATCH 108/123] Improved examples [skip ci] --- examples/cohere/example.py | 6 +++--- examples/openai/example.py | 25 ++++++++++++++--------- examples/sentence_transformers/example.py | 14 ++++++------- examples/sparse_search/example.py | 6 +++--- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/examples/cohere/example.py b/examples/cohere/example.py index 780352a..393d1e0 100644 --- a/examples/cohere/example.py +++ b/examples/cohere/example.py @@ -12,7 +12,7 @@ conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') -def fetch_embeddings(input, input_type): +def embed(input, input_type): co = cohere.Client() response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] @@ -23,12 +23,12 @@ def fetch_embeddings(input, input_type): 'The cat is purring', 'The bear is growling' ] -embeddings = fetch_embeddings(input, 'search_document') +embeddings = embed(input, 'search_document') for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, Bit(embedding))) query = 'forest' -query_embedding = fetch_embeddings([query], 'search_query')[0] +query_embedding = embed([query], 'search_query')[0] result = conn.execute('SELECT content FROM documents ORDER BY embedding <~> %s LIMIT 5', (Bit(query_embedding),)).fetchall() for row in result: print(row[0]) diff --git a/examples/openai/example.py b/examples/openai/example.py index ebed3d0..b9a078c 100644 --- a/examples/openai/example.py 
+++ b/examples/openai/example.py @@ -1,3 +1,4 @@ +import numpy as np from openai import OpenAI from pgvector.psycopg import register_vector import psycopg @@ -10,20 +11,24 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))') + +def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-small') + return [v.embedding for v in response.data] + + input = [ 'The dog is barking', 'The cat is purring', 'The bear is growling' ] - -client = OpenAI() -response = client.embeddings.create(input=input, model='text-embedding-3-small') -embeddings = [v.embedding for v in response.data] - +embeddings = embed(input) for content, embedding in zip(input, embeddings): - conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, np.array(embedding))) -document_id = 1 -neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() -for neighbor in neighbors: - print(neighbor[0]) +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (np.array(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/sentence_transformers/example.py b/examples/sentence_transformers/example.py index d4e7f96..3a7dca5 100644 --- a/examples/sentence_transformers/example.py +++ b/examples/sentence_transformers/example.py @@ -10,19 +10,19 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))') +model = SentenceTransformer('all-MiniLM-L6-v2') + input = [ 'The dog is barking', 'The cat is purring', 'The bear is growling' ] - -model = SentenceTransformer('all-MiniLM-L6-v2') embeddings = model.encode(input) - for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) -document_id = 1 -neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() -for neighbor in neighbors: - print(neighbor[0]) +query = 'forest' +query_embedding = model.encode(query) +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (query_embedding,)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/sparse_search/example.py b/examples/sparse_search/example.py index fa6074e..2b5daea 100644 --- a/examples/sparse_search/example.py +++ b/examples/sparse_search/example.py @@ -20,7 +20,7 @@ special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()] -def fetch_embeddings(input): +def embed(input): feature = tokenizer( input, padding=True, @@ -42,12 +42,12 @@ def fetch_embeddings(input): 'The cat is purring', 'The bear is growling' ] -embeddings = fetch_embeddings(input) +embeddings = embed(input) for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector(embedding))) query = 'forest' -query_embedding = fetch_embeddings([query])[0] +query_embedding = embed([query])[0] 
result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector(query_embedding),)).fetchall() for row in result: print(row[0]) From 1443c3c3ca11b9efadb07612758c2ba62fb4ec65 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 13:50:50 -0800 Subject: [PATCH 109/123] Added halfvec example for OpenAI [skip ci] --- examples/openai/halfvec.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/openai/halfvec.py diff --git a/examples/openai/halfvec.py b/examples/openai/halfvec.py new file mode 100644 index 0000000..185c785 --- /dev/null +++ b/examples/openai/halfvec.py @@ -0,0 +1,34 @@ +from openai import OpenAI +from pgvector.psycopg import register_vector, HalfVector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding halfvec(3072))') +conn.execute('CREATE INDEX ON documents USING hnsw (embedding halfvec_cosine_ops)') + + +def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-large') + return [v.embedding for v in response.data] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, HalfVector(embedding))) + +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (HalfVector(query_embedding),)).fetchall() +for row in result: + print(row[0]) From 12146d74db24514831138b43ec69273e289cde1a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 18:34:41 -0800 Subject: [PATCH 110/123] Improved example [skip ci] --- examples/sentence_transformers/example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sentence_transformers/example.py b/examples/sentence_transformers/example.py index 3a7dca5..50997d9 100644 --- a/examples/sentence_transformers/example.py +++ b/examples/sentence_transformers/example.py @@ -10,7 +10,7 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))') -model = SentenceTransformer('all-MiniLM-L6-v2') +model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') input = [ 'The dog is barking', From 78466224ec95a38441240753f090625056b87b1e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 19 Feb 2025 15:54:25 -0800 Subject: [PATCH 111/123] Added reference section to readme [skip ci] --- README.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/README.md b/README.md index 7f980bd..299753e 100644 --- a/README.md +++ b/README.md @@ -665,6 +665,99 @@ Item.add_index('embedding vector_l2_ops', using='hnsw') Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +## Reference + +### Half Vectors + +Create a half vector from a list + +```python +vec = HalfVector([1, 2, 3]) +``` + +Or a NumPy array + +```python +vec = HalfVector(np.array([1, 2, 3])) +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +### Sparse 
Vectors + +Create a sparse vector from a list + +```python +vec = SparseVector([1, 0, 2, 0, 3, 0]) +``` + +Or a NumPy array + +```python +vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) +``` + +Or a SciPy sparse array + +```python +arr = coo_array(([1, 2, 3], ([0, 2, 4],)), shape=(6,)) +vec = SparseVector(arr) +``` + +Or a dictionary of non-zero elements + +```python +vec = SparseVector({0: 1, 2: 2, 4: 3}, 6) +``` + +Note: Indices start at 0 + +Get the number of dimensions + +```python +dim = vec.dimensions() +``` + +Get the indices of non-zero elements + +```python +indices = vec.indices() +``` + +Get the values of non-zero elements + +```python +values = vec.values() +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +Get a SciPy sparse array + +```python +arr = vec.to_coo() +``` + ## History View the [changelog](https://github.com/pgvector/pgvector-python/blob/master/CHANGELOG.md) From ac1a543ab33a09efa2758f0179cea6a89257b601 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:07:29 -0700 Subject: [PATCH 112/123] Improved validation for Bit constructor --- pgvector/bit.py | 4 +++- tests/test_bit.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 4be7385..9a890a1 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -13,7 +13,9 @@ def __init__(self, value): elif value.dtype != np.bool: raise ValueError('expected dtype to be bool or uint8') else: - value = np.asarray(value, dtype=bool) + value = np.asarray(value) + if value.dtype != np.bool: + raise ValueError('expected dtype to be bool') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 5e1bff2..0c661d0 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,6 +7,11 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_int(self): + with pytest.raises(ValueError) as error: + Bit([254, 7, 0]) + assert str(error.value) == 'expected dtype to be bool' + def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] From 900cbb38370eebfeebdd519482cfd1a30cf6e937 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:14:46 -0700 Subject: [PATCH 113/123] Improved error message --- pgvector/bit.py | 2 +- tests/test_bit.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 9a890a1..a8feb55 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): else: value = np.asarray(value) if value.dtype != np.bool: - raise ValueError('expected dtype to be bool') + raise ValueError('expected all elements to be boolean') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 0c661d0..ae27359 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,10 +7,15 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_none(self): + with pytest.raises(ValueError) as error: + Bit([True, None, True]) + assert str(error.value) == 'expected all elements to be boolean' + def test_list_int(self): with pytest.raises(ValueError) as error: Bit([254, 7, 0]) - assert str(error.value) == 'expected dtype to be bool' + assert str(error.value) == 'expected all elements to be boolean' def test_tuple(self): assert Bit((True, False, 
True)).to_list() == [True, False, True] From 534ec18683d4c5e3058ba14d7810d0d5df7d8c55 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:34:18 -0700 Subject: [PATCH 114/123] Added support for bytes to Bit constructor --- CHANGELOG.md | 1 + pgvector/bit.py | 2 ++ tests/test_bit.py | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebc165a..89e955a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Added top-level `pgvector` package - Added support for pg8000 +- Added support for `bytes` to `Bit` constructor - Changed `globally` option to default to `False` for Psycopg 2 - Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes diff --git a/pgvector/bit.py b/pgvector/bit.py index a8feb55..8766f65 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -6,6 +6,8 @@ class Bit: def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value + elif isinstance(value, bytes): + self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: if isinstance(value, np.ndarray): if value.dtype == np.uint8: diff --git a/tests/test_bit.py b/tests/test_bit.py index ae27359..571205f 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -23,6 +23,10 @@ def test_tuple(self): def test_str(self): assert Bit('101').to_list() == [True, False, True] + def test_bytes(self): + assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] + assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) assert Bit(arr).to_text() == '111111100000011100000000' From 2d1b754773f8c4f41970b3f61b93b20460961f98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:54:26 -0700 Subject: [PATCH 115/123] Restored backwards compatibility of Bit constructor --- pgvector/bit.py | 15 ++++++--------- tests/test_bit.py | 18 ++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 8766f65..935f0f0 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -1,5 +1,6 @@ import numpy as np from struct import pack, unpack_from +from warnings import warn class Bit: @@ -9,15 +10,11 @@ def __init__(self, value): elif isinstance(value, bytes): self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: - if isinstance(value, np.ndarray): - if value.dtype == np.uint8: - value = np.unpackbits(value).astype(bool) - elif value.dtype != np.bool: - raise ValueError('expected dtype to be bool or uint8') - else: - value = np.asarray(value) - if value.dtype != np.bool: - raise ValueError('expected all elements to be boolean') + value = np.asarray(value) + + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 571205f..a13f476 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,14 +8,12 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.raises(ValueError) as error: - Bit([True, None, True]) - assert str(error.value) == 'expected all elements to be boolean' + with 
pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.raises(ValueError) as error: - Bit([254, 7, 0]) - assert str(error.value) == 'expected all elements to be boolean' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] @@ -29,13 +27,13 @@ def test_bytes(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - assert Bit(arr).to_text() == '111111100000011100000000' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.raises(ValueError) as error: - Bit(arr) - assert str(error.value) == 'expected dtype to be bool or uint8' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_same_object(self): arr = np.array([True, False, True]) From 2ce3f43e6693fec29e92fa84f7d46fefb96f98f0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 15:35:09 -0700 Subject: [PATCH 116/123] Improved internal representation of Bit class --- pgvector/bit.py | 47 +++++++++++++++++++++++++++-------------------- tests/test_bit.py | 10 +++++----- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 935f0f0..72b8052 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -5,51 +5,58 @@ class Bit: def __init__(self, value): - if isinstance(value, str): - self._value = self.from_text(value)._value - elif isinstance(value, bytes): - self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) + if isinstance(value, bytes): + self._len = 8 * len(value) + self._data = value else: - value = np.asarray(value) + if isinstance(value, str): + value = [v != '0' for v in value] + else: + value = np.asarray(value) - if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) - value = value.astype(bool) + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') - self._value = value + self._len = len(value) + self._data = np.packbits(value).tobytes() def __repr__(self): return f'Bit({self.to_text()})' def __eq__(self, other): if isinstance(other, self.__class__): - return np.array_equal(self.to_numpy(), other.to_numpy()) + return self._len == other._len and self._data == other._data return False def to_list(self): - return self._value.tolist() + return self.to_numpy().tolist() def to_numpy(self): - return self._value + return np.unpackbits(np.frombuffer(self._data, dtype=np.uint8), count=self._len).astype(bool) def to_text(self): - return ''.join(self._value.astype(np.uint8).astype(str)) + return ''.join(format(v, '08b') for v in self._data)[:self._len] def to_binary(self): - return pack('>i', len(self._value)) + np.packbits(self._value).tobytes() + return pack('>i', self._len) + self._data @classmethod def from_text(cls, value): - return cls(np.asarray([v != '0' for v in value], dtype=bool)) + return cls(str(value)) @classmethod def from_binary(cls, value): - count = unpack_from('>i', value)[0] - buf = np.frombuffer(value, dtype=np.uint8, offset=4) 
- return cls(np.unpackbits(buf, count=count).astype(bool)) + if not isinstance(value, bytes): + raise ValueError('expected bytes') + + bit = cls.__new__(cls) + bit._len = unpack_from('>i', value)[0] + bit._data = value[4:] + return bit @classmethod def _to_db(cls, value): diff --git a/tests/test_bit.py b/tests/test_bit.py index a13f476..cf1275e 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -25,6 +25,11 @@ def test_bytes(self): assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray(self): + arr = np.array([True, False, True]) + assert Bit(arr).to_list() == [True, False, True] + assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): @@ -35,11 +40,6 @@ def test_ndarray_uint16(self): with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' - def test_ndarray_same_object(self): - arr = np.array([True, False, True]) - assert Bit(arr).to_list() == [True, False, True] - assert Bit(arr).to_numpy() is arr - def test_ndim_two(self): with pytest.raises(ValueError) as error: Bit([[True, False], [True, False]]) From c2c17c2ab6365e55677bde47d1d13c63b4e87642 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:02:46 -0700 Subject: [PATCH 117/123] Removed warning for result of np.unpackbits --- pgvector/bit.py | 4 +++- tests/test_bit.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 72b8052..edfaec6 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,9 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) + # allow result of np.unpackbits + if value.dtype != np.uint8 or np.any(value > 1): + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index cf1275e..ef049c7 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -30,6 +30,10 @@ def test_ndarray(self): assert Bit(arr).to_list() == [True, False, True] assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_unpackbits(self): + arr = np.unpackbits(np.array([254, 7, 0], dtype=np.uint8)) + assert Bit(arr).to_text() == '111111100000011100000000' + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): From 50fac76f7959a155444e46d9e11be42403b09b26 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:04:10 -0700 Subject: [PATCH 118/123] Improved test --- tests/test_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_bit.py b/tests/test_bit.py index ef049c7..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -22,8 +22,8 @@ def test_str(self): assert Bit('101').to_list() == [True, False, True] def test_bytes(self): - assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] - assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, 
True, True, True] + assert Bit(b'\xff\x00\xf0').to_text() == '111111110000000011110000' + assert Bit(b'\xfe\x07\x00').to_text() == '111111100000011100000000' def test_ndarray(self): arr = np.array([True, False, True]) From 92bb02a531fc012369ee20f065028aec230d5dcf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:05:17 -0700 Subject: [PATCH 119/123] Updated comment [skip ci] --- pgvector/bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index edfaec6..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - # allow result of np.unpackbits + # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) From 4e22f9b26545f1b871cfba0fde21812ebc88ca84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:16:01 -0700 Subject: [PATCH 120/123] Updated warning message --- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 26a9d8d..e82b325 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('expected elements to be boolean', stacklevel=2) + warn('elements should be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index 5a71642..e920228 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 7a2dd806e79ad82960cc1a89159ca61f9a12a373 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:20:17 -0700 Subject: [PATCH 121/123] Revert "Updated warning message" This reverts commit 4e22f9b26545f1b871cfba0fde21812ebc88ca84. 
--- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index e82b325..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('elements should be boolean', stacklevel=2) + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index e920228..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 6bb6df8cce6d5b03e1a8a9b683ae37faaf12db7a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:35:04 -0700 Subject: [PATCH 122/123] Removed unreleased import --- pgvector/psycopg2/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index b40c673..33e5124 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,11 +1,10 @@ from .register import register_vector # TODO remove -from .. import HalfVector, SparseVector, Vector +from .. 
import HalfVector, SparseVector __all__ = [ 'register_vector', - 'Vector', 'HalfVector', 'SparseVector' ] From a8f2a5f8428ae10d79be53c0367fc007eca4ab78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 17:53:02 -0700 Subject: [PATCH 123/123] Version bump to 0.4.0 [skip ci] --- CHANGELOG.md | 2 +- README.md | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e955a..d0e2730 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.0 (unreleased) +## 0.4.0 (2025-03-15) - Added top-level `pgvector` package - Added support for pg8000 diff --git a/README.md b/README.md index 299753e..b6bc055 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) -- [pg8000](#pg8000) [unreleased] +- [pg8000](#pg8000) - [Peewee](#peewee) Or check out some examples: diff --git a/pyproject.toml b/pyproject.toml index 0f291f5..b889f4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.3.6" +version = "0.4.0" description = "pgvector support for Python" readme = "README.md" authors = [