From 0c88c2152918ed4d7abb83d93694ac966074a15b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 1 Nov 2023 13:18:06 -0700 Subject: [PATCH 001/424] Added Reciprocal Rank Fusion example to readme [skip ci] --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 37030e0..5362a1d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ Or check out some examples: - [Embeddings](examples/openai_embeddings.py) with OpenAI - [Sentence embeddings](examples/sentence_embeddings.py) with SentenceTransformers -- [Hybrid search](examples/hybrid_search.py) with SentenceTransformers +- [Hybrid search](examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) +- [Hybrid search](examples/hybrid_search.py) with SentenceTransformers (cross-encoder) - [Image search](examples/pytorch_image_search.py) with PyTorch - [Implicit feedback recommendations](examples/implicit_recs.py) with Implicit - [Explicit feedback recommendations](examples/surprise_recs.py) with Surprise From 416b8456a19b7625ea80cf768083ccbc70ff7fb2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 1 Nov 2023 13:19:58 -0700 Subject: [PATCH 002/424] Use absolute urls for examples for PyPI [skip ci] --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5362a1d..1540410 100644 --- a/README.md +++ b/README.md @@ -26,14 +26,14 @@ And follow the instructions for your database library: Or check out some examples: -- [Embeddings](examples/openai_embeddings.py) with OpenAI -- [Sentence embeddings](examples/sentence_embeddings.py) with SentenceTransformers -- [Hybrid search](examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) -- [Hybrid search](examples/hybrid_search.py) with SentenceTransformers (cross-encoder) -- [Image search](examples/pytorch_image_search.py) with PyTorch -- [Implicit feedback recommendations](examples/implicit_recs.py) with 
Implicit -- [Explicit feedback recommendations](examples/surprise_recs.py) with Surprise -- [Recommendations](examples/lightfm_recs.py) with LightFM +- [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai_embeddings.py) with OpenAI +- [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_embeddings.py) with SentenceTransformers +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch +- [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit +- [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise +- [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM ## Django From 7997a4b2b98f318fb289902eb64e922b770c25ce Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 18 Nov 2023 11:47:03 -0800 Subject: [PATCH 003/424] Updated OpenAI example [skip ci] --- examples/openai_embeddings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/openai_embeddings.py b/examples/openai_embeddings.py index 08926f0..ec766d5 100644 --- a/examples/openai_embeddings.py +++ b/examples/openai_embeddings.py @@ -1,4 +1,4 @@ -import openai +from openai import OpenAI from pgvector.psycopg import register_vector import psycopg @@ -16,8 +16,9 @@ 'The bear is growling' ] -response = openai.Embedding.create(input=input, model='text-embedding-ada-002') -embeddings = [v['embedding'] for v in response['data']] +client = OpenAI() 
+response = client.embeddings.create(input=input, model='text-embedding-ada-002') +embeddings = [v.embedding for v in response.data] for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) From bab9e4b2319329e61caa203fd08bff5f42e7e7f6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 21 Nov 2023 09:22:51 -0800 Subject: [PATCH 004/424] Improved test --- tests/test_django.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_django.py b/tests/test_django.py index 5b9c386..b68b5f2 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -61,7 +61,11 @@ class Migration(migrations.Migration): ), migrations.AddIndex( model_name='item', - index=pgvector.django.IvfflatIndex(fields=['embedding'], lists=1, name='my_index', opclasses=['vector_l2_ops']), + index=pgvector.django.IvfflatIndex(fields=['embedding'], lists=1, name='ivfflat_idx', opclasses=['vector_l2_ops']), + ), + migrations.AddIndex( + model_name='item', + index=pgvector.django.HnswIndex(fields=['embedding'], m=16, ef_construction=64, name='hnsw_idx', opclasses=['vector_l2_ops']), ) ] From 0705e3abbfe248f3ef288078405328cc3f62ad5d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 21 Nov 2023 10:44:08 -0800 Subject: [PATCH 005/424] Added more tests for aggregates with SQLAlchemy - #44 --- tests/test_sqlalchemy.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 21b46ac..c837840 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -166,6 +166,15 @@ def test_avg(self): avg = session.query(func.avg(Item.embedding)).first()[0] assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + def test_avg_orm(self): + with Session(engine) as session: + avg = session.scalars(select(func.avg(Item.embedding))).first() + assert avg is None + session.add(Item(embedding=[1, 2, 3])) + 
session.add(Item(embedding=[4, 5, 6])) + avg = session.scalars(select(func.avg(Item.embedding))).first() + assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + def test_sum(self): with Session(engine) as session: sum = session.query(func.sum(Item.embedding)).first()[0] @@ -175,6 +184,15 @@ def test_sum(self): sum = session.query(func.sum(Item.embedding)).first()[0] assert np.array_equal(sum, np.array([5, 7, 9])) + def test_sum_orm(self): + with Session(engine) as session: + sum = session.scalars(select(func.sum(Item.embedding))).first() + assert sum is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + sum = session.scalars(select(func.sum(Item.embedding))).first() + assert np.array_equal(sum, np.array([5, 7, 9])) + def test_bad_dimensions(self): item = Item(embedding=[1, 2]) session = Session(engine) From a03a9a6e61f3bf65f82b6ea94d493edaa338aadc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 21 Nov 2023 10:47:17 -0800 Subject: [PATCH 006/424] Updated SQLAlchemy example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1540410..173c227 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Average vectors ```python from sqlalchemy.sql import func -session.query(func.avg(Item.embedding)).first()[0] +session.scalars(select(func.avg(Item.embedding))).first() ``` Also supports `sum` From 7ab418e705f3dbd4c58c05f35c02b10b49568fbb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 21 Nov 2023 11:03:51 -0800 Subject: [PATCH 007/424] Added test for asyncio with SQLAlchemy - #44 --- requirements.txt | 2 +- tests/test_sqlalchemy.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e111e7d..b4ce55e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ psycopg[binary] psycopg2-binary pytest pytest-asyncio -SQLAlchemy +SQLAlchemy[asyncio] diff --git 
a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index c837840..9808f0e 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import create_engine, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.orm import declarative_base, mapped_column, Session from sqlalchemy.sql import func @@ -213,3 +214,17 @@ def test_bad_dtype(self): session.add(item) with pytest.raises(StatementError, match='dtype must be numeric'): session.commit() + + @pytest.mark.asyncio + async def test_async(self): + engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + avg = await session.scalars(select(func.avg(Item.embedding))) + assert avg.first() == '[2.5,3.5,4.5]' + + await engine.dispose() From 4e02d6f7d62eba2e8febf731693accbf82925704 Mon Sep 17 00:00:00 2001 From: KellyRousselHoomano <86828998+KellyRousselHoomano@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:16:26 +0100 Subject: [PATCH 008/424] Registering Vector Type with SQLAlchemy Registries (#45) --- pgvector/sqlalchemy/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 1cf5b66..ec2d85a 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,6 +1,6 @@ from sqlalchemy.types import UserDefinedType, Float from ..utils import from_db, to_db - +from sqlalchemy.dialects import postgresql __all__ = ['Vector'] @@ -35,3 +35,6 @@ def max_inner_product(self, other): def cosine_distance(self, other): return self.op('<=>', return_type=Float)(other) + +# 
Register Vector type to PostgreSQL's reflection subsystem +postgresql.base.ischema_names['vector'] = Vector \ No newline at end of file From 478d0ab4bb64c3e8b3795f0af812a758736af5d7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:18:40 -0800 Subject: [PATCH 009/424] Fixed style lint [skip ci] --- pgvector/sqlalchemy/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index ec2d85a..775518e 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,6 +1,7 @@ +from sqlalchemy.dialects import postgresql from sqlalchemy.types import UserDefinedType, Float from ..utils import from_db, to_db -from sqlalchemy.dialects import postgresql + __all__ = ['Vector'] @@ -36,5 +37,6 @@ def max_inner_product(self, other): def cosine_distance(self, other): return self.op('<=>', return_type=Float)(other) + # Register Vector type to PostgreSQL's reflection subsystem -postgresql.base.ischema_names['vector'] = Vector \ No newline at end of file +postgresql.base.ischema_names['vector'] = Vector From dd793e8f91006c60a39e1e9954cda13675bd110b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:19:25 -0800 Subject: [PATCH 010/424] Added test for reflection with SQLAlchemy --- tests/test_sqlalchemy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9808f0e..32c341c 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import Vector import pytest -from sqlalchemy import create_engine, select, text, MetaData, Table, Column, Index, Integer +from sqlalchemy import create_engine, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.orm import 
declarative_base, mapped_column, Session @@ -215,6 +215,10 @@ def test_bad_dtype(self): with pytest.raises(StatementError, match='dtype must be numeric'): session.commit() + def test_inspect(self): + columns = inspect(engine).get_columns('orm_item') + assert isinstance(columns[1]['type'], Vector) + @pytest.mark.asyncio async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') From 323e525911c6fb776602c8ca903b4d3e36f77bdf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:25:17 -0800 Subject: [PATCH 011/424] Updated changelog [skip ci] --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8dc947..96ac6a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.4 (unreleased) + +- Improved reflection with SQLAlchemy + ## 0.2.3 (2023-09-25) - Fixed null values with Django From ef5de46dcb7b2bc4c3deb5e6d2eca0cadeb9e88c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:28:04 -0800 Subject: [PATCH 012/424] Updated CI --- .github/workflows/build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d711cd5..768cc3c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,9 +6,9 @@ jobs: strategy: fail-fast: false matrix: - python: [3.11, 3.8] + python: [3.12, 3.8] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.5.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From 30d29e7da71de5035f4f0fbdabfec88dcb96fdf8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:41:36 -0800 Subject: [PATCH 013/424] Added tests for SQLModel --- 
requirements.txt | 3 +- tests/test_sqlmodel.py | 89 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tests/test_sqlmodel.py diff --git a/requirements.txt b/requirements.txt index b4ce55e..da0ef44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ psycopg[binary] psycopg2-binary pytest pytest-asyncio -SQLAlchemy[asyncio] +SQLAlchemy[asyncio]>=2 +sqlmodel>=0.0.12 diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py new file mode 100644 index 0000000..c80817e --- /dev/null +++ b/tests/test_sqlmodel.py @@ -0,0 +1,89 @@ +import numpy as np +from pgvector.sqlalchemy import Vector +import pytest +from sqlalchemy import Column +from sqlalchemy.exc import StatementError +from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text +from typing import List, Optional + +engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +with Session(engine) as session: + session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) + + +class Item(SQLModel, table=True): + __tablename__ = 'sqlmodel_item' + + id: Optional[int] = Field(default=None, primary_key=True) + embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Vector(3))) + + +SQLModel.metadata.drop_all(engine) +SQLModel.metadata.create_all(engine) + + +def create_items(): + vectors = [ + [1, 1, 1], + [2, 2, 2], + [1, 1, 2] + ] + session = Session(engine) + for i, v in enumerate(vectors): + session.add(Item(id=i + 1, embedding=v)) + session.commit() + + +class TestSqlmodel: + def setup_method(self, test_method): + with Session(engine) as session: + session.exec(delete(Item)) + session.commit() + + def test_orm(self): + item = Item(embedding=[1.5, 2, 3]) + item2 = Item(embedding=[4, 5, 6]) + item3 = Item() + + session = Session(engine) + session.add(item) + session.add(item2) + session.add(item3) + session.commit() + + stmt = select(Item) + with Session(engine) as session: + items = 
session.exec(stmt).all() + assert items[0].id == 1 + assert items[1].id == 2 + assert items[2].id == 3 + assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) + assert items[0].embedding.dtype == np.float32 + assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) + assert items[1].embedding.dtype == np.float32 + assert items[2].embedding is None + + def test_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_bad_dimensions(self): + item = Item(embedding=[1, 2]) + session = Session(engine) + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() From 696e81bc12c9f183ce0c4a898d84d08014003df3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:44:42 -0800 Subject: [PATCH 014/424] Updated example for latest SQLModel [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 173c227..4937fde 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distanc Enable the extension ```python -session.exec('CREATE EXTENSION IF NOT EXISTS vector') +session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) ``` Add a vector column From 38dea36c1a34496b2657b366da27b02b0f3e0c4a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 10:54:59 -0800 
Subject: [PATCH 015/424] Improved docs [skip ci] --- README.md | 18 ++++++++++++++++++ tests/test_asyncpg.py | 4 ++-- tests/test_psycopg.py | 4 ++-- tests/test_psycopg2.py | 2 +- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4937fde..637e7a1 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,12 @@ from pgvector.psycopg import register_vector_async await register_vector_async(conn) ``` +Create a table + +```python +conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python @@ -286,6 +292,12 @@ from pgvector.psycopg2 import register_vector register_vector(conn) ``` +Create a table + +```python +cur.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python @@ -325,6 +337,12 @@ async def init(conn): pool = await asyncpg.create_pool(..., init=init) ``` +Create a table + +```python +await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 2634310..33b8f96 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -11,7 +11,7 @@ async def test_works(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector(conn) @@ -41,7 +41,7 @@ async def init(conn): async with pool.acquire() as conn: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') 
embedding = np.array([1.5, 2, 3]) await conn.execute("INSERT INTO item (embedding) VALUES ($1), (NULL)", embedding) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 73c54d9..3e9ced9 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -7,7 +7,7 @@ conn.execute('CREATE EXTENSION IF NOT EXISTS vector') conn.execute('DROP TABLE IF EXISTS item') -conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') +conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') register_vector(conn) @@ -77,7 +77,7 @@ async def test_async(self): await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector_async(conn) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index ac899ed..94ffba3 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -8,7 +8,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') cur.execute('DROP TABLE IF EXISTS item') -cur.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') +cur.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') register_vector(cur) From 96862702c838c63e700f70632c9366a376b930f3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 Nov 2023 11:00:58 -0800 Subject: [PATCH 016/424] Updated table names [skip ci] --- README.md | 18 +++++++++--------- examples/pytorch_image_search.py | 8 ++++---- tests/test_asyncpg.py | 18 +++++++++--------- tests/test_psycopg.py | 24 ++++++++++++------------ tests/test_psycopg2.py | 10 +++++----- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 637e7a1..2b5f8b9 100644 --- a/README.md +++ b/README.md @@ -259,20 +259,20 @@ await register_vector_async(conn) 
Create a table ```python -conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') ``` Insert a vector ```python embedding = np.array([1, 2, 3]) -conn.execute('INSERT INTO item (embedding) VALUES (%s)', (embedding,)) +conn.execute('INSERT INTO items (embedding) VALUES (%s)', (embedding,)) ``` Get the nearest neighbors to a vector ```python -conn.execute('SELECT * FROM item ORDER BY embedding <-> %s LIMIT 5', (embedding,)).fetchall() +conn.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding,)).fetchall() ``` ## Psycopg 2 @@ -295,20 +295,20 @@ register_vector(conn) Create a table ```python -cur.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +cur.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') ``` Insert a vector ```python embedding = np.array([1, 2, 3]) -cur.execute('INSERT INTO item (embedding) VALUES (%s)', (embedding,)) +cur.execute('INSERT INTO items (embedding) VALUES (%s)', (embedding,)) ``` Get the nearest neighbors to a vector ```python -cur.execute('SELECT * FROM item ORDER BY embedding <-> %s LIMIT 5', (embedding,)) +cur.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding,)) cur.fetchall() ``` @@ -340,20 +340,20 @@ pool = await asyncpg.create_pool(..., init=init) Create a table ```python -await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') ``` Insert a vector ```python embedding = np.array([1, 2, 3]) -await conn.execute('INSERT INTO item (embedding) VALUES ($1)', embedding) +await conn.execute('INSERT INTO items (embedding) VALUES ($1)', embedding) ``` Get the nearest neighbors to a vector ```python -await conn.fetch('SELECT * FROM item ORDER BY embedding <-> $1 LIMIT 5', embedding) +await conn.fetch('SELECT * FROM items 
ORDER BY embedding <-> $1 LIMIT 5', embedding) ``` ## Peewee diff --git a/examples/pytorch_image_search.py b/examples/pytorch_image_search.py index 0cc9af0..e05571c 100644 --- a/examples/pytorch_image_search.py +++ b/examples/pytorch_image_search.py @@ -38,14 +38,14 @@ def generate_embeddings(inputs): # generate, save, and index embeddings if seed: - conn.execute('DROP TABLE IF EXISTS image') - conn.execute('CREATE TABLE image (id bigserial PRIMARY KEY, embedding vector(512))') + conn.execute('DROP TABLE IF EXISTS images') + conn.execute('CREATE TABLE images (id bigserial PRIMARY KEY, embedding vector(512))') print('Generating embeddings') for data in tqdm(dataloader): embeddings = generate_embeddings(data[0]) - sql = 'INSERT INTO image (embedding) VALUES ' + ','.join(['(%s)' for _ in embeddings]) + sql = 'INSERT INTO images (embedding) VALUES ' + ','.join(['(%s)' for _ in embeddings]) params = [embedding for embedding in embeddings] conn.execute(sql, params) @@ -67,5 +67,5 @@ def show_images(dataset_images): # generate and query embeddings embeddings = generate_embeddings(images) for image, embedding in zip(images, embeddings): - result = conn.execute('SELECT id FROM image ORDER BY embedding <=> %s LIMIT 15', (embedding,)).fetchall() + result = conn.execute('SELECT id FROM images ORDER BY embedding <=> %s LIMIT 15', (embedding,)).fetchall() show_images([image] + [dataset[row[0] - 1][0] for row in result]) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 33b8f96..f06ac3d 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -10,15 +10,15 @@ class TestAsyncpg: async def test_works(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id 
bigserial PRIMARY KEY, embedding vector(3))') await register_vector(conn) embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO item (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM item ORDER BY id") + res = await conn.fetch("SELECT * FROM items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) @@ -26,7 +26,7 @@ async def test_works(self): assert res[1]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM item ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '[1.5,2,3]' await conn.close() @@ -40,13 +40,13 @@ async def init(conn): async with pool.acquire() as conn: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO item (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM item ORDER BY id") + res = await conn.fetch("SELECT * FROM items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 3e9ced9..926bf45 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -6,21 +6,21 @@ conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') 
-conn.execute('DROP TABLE IF EXISTS item') -conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +conn.execute('DROP TABLE IF EXISTS items') +conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') register_vector(conn) class TestPsycopg: def setup_method(self, test_method): - conn.execute('DELETE FROM item') + conn.execute('DELETE FROM items') def test_works(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + conn.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) - res = conn.execute('SELECT * FROM item ORDER BY id').fetchall() + res = conn.execute('SELECT * FROM items ORDER BY id').fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 assert res[1][1] is None @@ -55,19 +55,19 @@ def test_binary_format_non_contiguous(self): def test_text_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY item (embedding) FROM STDIN") as copy: + with cur.copy("COPY items (embedding) FROM STDIN") as copy: copy.write_row([embedding]) def test_binary_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY item (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + with cur.copy("COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.write_row([embedding]) def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY item (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(['int8', 'vector']) copy.write_row([1, embedding]) @@ -76,16 +76,16 @@ async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF 
EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector_async(conn) embedding = np.array([1.5, 2, 3]) - await conn.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + await conn.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) async with conn.cursor() as cur: - await cur.execute('SELECT * FROM item ORDER BY id') + await cur.execute('SELECT * FROM items ORDER BY id') res = await cur.fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 94ffba3..925d60f 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -7,21 +7,21 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') -cur.execute('DROP TABLE IF EXISTS item') -cur.execute('CREATE TABLE item (id bigserial PRIMARY KEY, embedding vector(3))') +cur.execute('DROP TABLE IF EXISTS items') +cur.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') register_vector(cur) class TestPsycopg2: def setup_method(self, test_method): - cur.execute('DELETE FROM item') + cur.execute('DELETE FROM items') def test_works(self): embedding = np.array([1.5, 2, 3]) - cur.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + cur.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) - cur.execute('SELECT * FROM item ORDER BY id') + cur.execute('SELECT * FROM items ORDER BY id') res = cur.fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 From a8c5d8479a066c6d941ee7277561745c422aa9b3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 24 Nov 2023 09:23:59 -0800 Subject: [PATCH 017/424] Updated import --- pgvector/sqlalchemy/__init__.py | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 775518e..e6c128a 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,4 +1,4 @@ -from sqlalchemy.dialects import postgresql +from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float from ..utils import from_db, to_db @@ -38,5 +38,5 @@ def cosine_distance(self, other): return self.op('<=>', return_type=Float)(other) -# Register Vector type to PostgreSQL's reflection subsystem -postgresql.base.ischema_names['vector'] = Vector +# for reflection +ischema_names['vector'] = Vector From d86ea90da3817a015cdac18264de8e79275b3071 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 24 Nov 2023 09:26:23 -0800 Subject: [PATCH 018/424] Version bump to 0.2.4 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96ac6a4..69bdcdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.4 (unreleased) +## 0.2.4 (2023-11-24) - Improved reflection with SQLAlchemy diff --git a/setup.py b/setup.py index 227b6cf..4b24a00 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.2.3', + version='0.2.4', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From e9232f9f5e098f670e6f723b078f24ae506688fd Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 29 Nov 2023 21:43:13 -0800 Subject: [PATCH 019/424] Added Citus example [skip ci] --- README.md | 1 + examples/citus.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 examples/citus.py diff --git a/README.md b/README.md index 2b5f8b9..94ff51e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Or check out some examples: - [Implicit feedback 
recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit - [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise - [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM +- [Horizontal scaling](https://github.com/pgvector/pgvector-python/blob/master/examples/citus.py) with Citus ## Django diff --git a/examples/citus.py b/examples/citus.py new file mode 100644 index 0000000..915c25f --- /dev/null +++ b/examples/citus.py @@ -0,0 +1,49 @@ +import numpy as np +from pgvector.psycopg import register_vector +import psycopg + +# generate random data +rows = 100000 +dimensions = 128 +embeddings = np.random.rand(rows, dimensions) +categories = np.random.randint(100, size=rows).tolist() +queries = np.random.rand(10, dimensions) + +# enable extensions +conn = psycopg.connect(dbname='pgvector_citus', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS citus') +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + +# GUC variables set on the session do not propagate to Citus workers +# https://github.com/citusdata/citus/issues/462 +# you can either: +# 1. set them on the system, user, or database and reconnect +# 2. 
set them for a transaction with SET LOCAL +conn.execute("ALTER DATABASE pgvector_citus SET maintenance_work_mem = '512MB'") +conn.execute('ALTER DATABASE pgvector_citus SET hnsw.ef_search = 20') +conn.close() + +# reconnect for updated GUC variables to take effect +conn = psycopg.connect(dbname='pgvector_citus', autocommit=True) +register_vector(conn) + +print('Creating distributed table') +conn.execute('DROP TABLE IF EXISTS items') +conn.execute('CREATE TABLE items (id bigserial, embedding vector(%d), category_id bigint, PRIMARY KEY (id, category_id))' % dimensions) +conn.execute('SET citus.shard_count = 4') +conn.execute("SELECT create_distributed_table('items', 'category_id')") + +print('Loading data in parallel') +with conn.cursor().copy('COPY items (embedding, category_id) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['vector', 'bigint']) + + for i in range(rows): + copy.write_row([embeddings[i], categories[i]]) + +print('Creating index in parallel') +conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') + +print('Running distributed queries') +for query in queries: + items = conn.execute('SELECT id FROM items ORDER BY embedding <-> %s LIMIT 10', (query,)).fetchall() + print([r[0] for r in items]) From 907f67836708a3533b08a1827cba5a9f72561f54 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 18 Jan 2024 16:39:25 -0800 Subject: [PATCH 020/424] Added literal binds support for SQLAlchemy - closes #51 --- CHANGELOG.md | 4 ++++ pgvector/sqlalchemy/__init__.py | 9 ++++++++- tests/test_sqlalchemy.py | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69bdcdb..cebbf99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.5 (unreleased) + +- Added literal binds support for SQLAlchemy + ## 0.2.4 (2023-11-24) - Improved reflection with SQLAlchemy diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index e6c128a..8b798e4 100644 --- 
a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,5 +1,5 @@ from sqlalchemy.dialects.postgresql.base import ischema_names -from sqlalchemy.types import UserDefinedType, Float +from sqlalchemy.types import UserDefinedType, Float, String from ..utils import from_db, to_db __all__ = ['Vector'] @@ -7,6 +7,7 @@ class Vector(UserDefinedType): cache_ok = True + _string = String() def __init__(self, dim=None): super(UserDefinedType, self).__init__() @@ -22,6 +23,12 @@ def process(value): return to_db(value, self.dim) return process + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + def process(value): + return string_literal_processor(to_db(value, self.dim)) + return process + def result_processor(self, dialect, coltype): def process(value): return from_db(value) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 32c341c..0c4681e 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -219,6 +219,10 @@ def test_inspect(self): columns = inspect(engine).get_columns('orm_item') assert isinstance(columns[1]['type'], Vector) + def test_literal_binds(self): + sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(compile_kwargs={'literal_binds': True}) + assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) + @pytest.mark.asyncio async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') From aa7214eb9a138c547c35d27423dcbaf5c887d8a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 18 Jan 2024 16:49:07 -0800 Subject: [PATCH 021/424] Fixed CI --- pgvector/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pgvector/__init__.py diff --git a/pgvector/__init__.py b/pgvector/__init__.py new file mode 100644 index 0000000..e69de29 From c6fb53b1db9e9f275c380009741cb7fa1724e772 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 18 Jan 2024 18:58:13 
-0800 Subject: [PATCH 022/424] Improved test --- tests/test_sqlalchemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0c4681e..0f40e43 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -220,7 +220,7 @@ def test_inspect(self): assert isinstance(columns[1]['type'], Vector) def test_literal_binds(self): - sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(compile_kwargs={'literal_binds': True}) + sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) @pytest.mark.asyncio From 2bb8c3c2dbe4bb6620db246cfab1b7f8f4ddbb43 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 25 Jan 2024 18:22:06 -0800 Subject: [PATCH 023/424] Updated OpenAI model [skip ci] --- examples/openai_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/openai_embeddings.py b/examples/openai_embeddings.py index ec766d5..ebed3d0 100644 --- a/examples/openai_embeddings.py +++ b/examples/openai_embeddings.py @@ -17,7 +17,7 @@ ] client = OpenAI() -response = client.embeddings.create(input=input, model='text-embedding-ada-002') +response = client.embeddings.create(input=input, model='text-embedding-3-small') embeddings = [v.embedding for v in response.data] for content, embedding in zip(input, embeddings): From d5f764f1798fe4b4708c05224f2eb2bd14ae5a6a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 7 Feb 2024 11:34:21 -0800 Subject: [PATCH 024/424] Version bump to 0.2.5 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cebbf99..5bfc2dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.5 (unreleased) +## 0.2.5 (2024-02-07) - Added literal binds support for SQLAlchemy diff --git a/setup.py b/setup.py index 
4b24a00..dff986d 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.2.4', + version='0.2.5', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From bfa5954f3e731894b51d119dc9d23f3f16645870 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 7 Feb 2024 11:35:36 -0800 Subject: [PATCH 025/424] Fixed lint [skip ci] --- pgvector/sqlalchemy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 8b798e4..6220082 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -25,6 +25,7 @@ def process(value): def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) + def process(value): return string_literal_processor(to_db(value, self.dim)) return process From 8730940ea235e16ac30c9cb8f27ed41a47aa0ca5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 17 Feb 2024 10:22:15 -0800 Subject: [PATCH 026/424] Updated badge [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94ff51e..54f74be 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), and [Peewee](https://github.com/coleifer/peewee) -[![Build Status](https://github.com/pgvector/pgvector-python/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-python/actions) +[![Build Status](https://github.com/pgvector/pgvector-python/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-python/actions) ## Installation From 
ee12393e9e18910bc6f9115053ce11efdc229134 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 17 Feb 2024 10:31:45 -0800 Subject: [PATCH 027/424] Updated pgvector on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 768cc3c..c39241c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.5.1 https://github.com/pgvector/pgvector.git + git clone --branch v0.6.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From 913fbb0dcb1e3e98f0b748cc0ad09c8ddd7fac05 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 18 Feb 2024 12:07:13 -0800 Subject: [PATCH 028/424] Updated readme [skip ci] --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 54f74be..f8b0d9b 100644 --- a/README.md +++ b/README.md @@ -100,23 +100,23 @@ Also supports `Sum` Add an approximate index ```python -from pgvector.django import IvfflatIndex, HnswIndex +from pgvector.django import HnswIndex, IvfflatIndex class Item(models.Model): class Meta: indexes = [ - IvfflatIndex( + HnswIndex( name='my_index', fields=['embedding'], - lists=100, + m=16, + ef_construction=64, opclasses=['vector_l2_ops'] ), # or - HnswIndex( + IvfflatIndex( name='my_index', fields=['embedding'], - m=16, - ef_construction=64, + lists=100, opclasses=['vector_l2_ops'] ) ] @@ -183,14 +183,14 @@ Add an approximate index ```python index = Index('my_index', Item.embedding, - postgresql_using='ivfflat', - postgresql_with={'lists': 100}, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_l2_ops'} ) # or index = Index('my_index', Item.embedding, - postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_using='ivfflat', + 
postgresql_with={'lists': 100}, postgresql_ops={'embedding': 'vector_l2_ops'} ) From 2836dd3d12f07869fbc9655d9751a6910be745c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 22 Feb 2024 20:09:38 -0800 Subject: [PATCH 029/424] Added more SQLModel examples and tests --- README.md | 22 ++++++++++++++++++++++ tests/test_sqlmodel.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/README.md b/README.md index f8b0d9b..475fb70 100644 --- a/README.md +++ b/README.md @@ -233,6 +233,28 @@ session.exec(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit( Also supports `max_inner_product` and `cosine_distance` +Get the distance + +```python +session.exec(select(Item.embedding.l2_distance([3, 1, 2]))) +``` + +Get items within a certain distance + +```python +session.exec(select(Item).filter(Item.embedding.l2_distance([3, 1, 2]) < 5)) +``` + +Average vectors + +```python +from sqlalchemy.sql import func + +session.exec(select(func.avg(Item.embedding))).first() +``` + +Also supports `sum` + ## Psycopg 3 Enable the extension diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index c80817e..9dccedc 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import Column from sqlalchemy.exc import StatementError +from sqlalchemy.sql import func from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text from typing import List, Optional @@ -81,6 +82,36 @@ def test_cosine_distance(self): items = session.exec(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] + def test_filter(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1)) + assert [v.id for v in items] == [1] + + def test_select(self): + with Session(engine) as session: + session.add(Item(embedding=[2, 3, 3])) + item = 
session.exec(select(Item.embedding.l2_distance([1, 1, 1]))).all() + assert item[0] == 3 + + def test_avg(self): + with Session(engine) as session: + avg = session.exec(select(func.avg(Item.embedding))).first() + assert avg is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + avg = session.exec(select(func.avg(Item.embedding))).first() + assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + + def test_sum(self): + with Session(engine) as session: + sum = session.exec(select(func.sum(Item.embedding))).first() + assert sum is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + sum = session.exec(select(func.sum(Item.embedding))).first() + assert np.array_equal(sum, np.array([5, 7, 9])) + def test_bad_dimensions(self): item = Item(embedding=[1, 2]) session = Session(engine) From 69e1e92f5f9ae7f6c27a784f271fe0a8e61221b1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 22 Feb 2024 20:12:27 -0800 Subject: [PATCH 030/424] Improved examples in readme [skip ci] --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index 475fb70..0cb6928 100644 --- a/README.md +++ b/README.md @@ -298,6 +298,16 @@ Get the nearest neighbors to a vector conn.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding,)).fetchall() ``` +Add an approximate index + +```python +conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Psycopg 2 Enable the extension @@ -335,6 +345,16 @@ cur.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding, cur.fetchall() ``` +Add an approximate index + +```python +cur.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +cur.execute('CREATE INDEX 
ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## asyncpg Enable the extension @@ -379,6 +399,16 @@ Get the nearest neighbors to a vector await conn.fetch('SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5', embedding) ``` +Add an approximate index + +```python +await conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +await conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Peewee Add a vector column From b2dafb6d385bea9c51dea55f7223b3b63bf84bed Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 22 Feb 2024 20:26:21 -0800 Subject: [PATCH 031/424] Improved readme and tests --- README.md | 34 ++++++++++++++++++++++++++++++++-- tests/test_sqlalchemy.py | 9 +++++++++ tests/test_sqlmodel.py | 11 ++++++++++- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0cb6928..2122299 100644 --- a/README.md +++ b/README.md @@ -182,13 +182,17 @@ Also supports `sum` Add an approximate index ```python -index = Index('my_index', Item.embedding, +index = Index( + 'my_index', + Item.embedding, postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_l2_ops'} ) # or -index = Index('my_index', Item.embedding, +index = Index( + 'my_index', + Item.embedding, postgresql_using='ivfflat', postgresql_with={'lists': 100}, postgresql_ops={'embedding': 'vector_l2_ops'} @@ -255,6 +259,32 @@ session.exec(select(func.avg(Item.embedding))).first() Also supports `sum` +Add an approximate index + +```python +from sqlalchemy import Index + +index = Index( + 'my_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +# or 
+index = Index( + 'my_index', + Item.embedding, + postgresql_using='ivfflat', + postgresql_with={'lists': 100}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) + +index.create(engine) +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Psycopg 3 Enable the extension diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0f40e43..858e58c 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -25,6 +25,15 @@ class Item(Base): Base.metadata.drop_all(engine) Base.metadata.create_all(engine) +index = Index( + 'orm_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +index.create(engine) + def create_items(): vectors = [ diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 9dccedc..4e30d2b 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import Vector import pytest -from sqlalchemy import Column +from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError from sqlalchemy.sql import func from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text @@ -22,6 +22,15 @@ class Item(SQLModel, table=True): SQLModel.metadata.drop_all(engine) SQLModel.metadata.create_all(engine) +index = Index( + 'sqlmodel_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +index.create(engine) + def create_items(): vectors = [ From 65263193b10aaa3b10c59ef82fe92c4df4f2d46f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 23 Feb 2024 11:14:40 -0800 Subject: [PATCH 032/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 8 ++++---- tests/test_sqlmodel.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py 
index 858e58c..441b8fd 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -158,14 +158,14 @@ def test_filter_orm(self): def test_select(self): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) - item = session.query(Item.embedding.l2_distance([1, 1, 1])).first() - assert item[0] == 3 + items = session.query(Item.embedding.l2_distance([1, 1, 1])).first() + assert items[0] == 3 def test_select_orm(self): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) - item = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() - assert item[0] == 3 + items = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() + assert items[0] == 3 def test_avg(self): with Session(engine) as session: diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 4e30d2b..71fa801 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -100,8 +100,8 @@ def test_filter(self): def test_select(self): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) - item = session.exec(select(Item.embedding.l2_distance([1, 1, 1]))).all() - assert item[0] == 3 + items = session.exec(select(Item.embedding.l2_distance([1, 1, 1]))).all() + assert items[0] == 3 def test_avg(self): with Session(engine) as session: From 2866df999e558eeb5397214143acf859a1ae2a4f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 29 Feb 2024 21:15:58 -0800 Subject: [PATCH 033/424] Added bulk loading example [skip ci] --- README.md | 1 + examples/bulk_loading.py | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 examples/bulk_loading.py diff --git a/README.md b/README.md index 2122299..8aa0411 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Or check out some examples: - [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise - 
[Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM - [Horizontal scaling](https://github.com/pgvector/pgvector-python/blob/master/examples/citus.py) with Citus +- [Bulk loading](https://github.com/pgvector/pgvector-python/blob/master/examples/bulk_loading.py) with `COPY` ## Django diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py new file mode 100644 index 0000000..8fa1928 --- /dev/null +++ b/examples/bulk_loading.py @@ -0,0 +1,42 @@ +import numpy as np +from pgvector.psycopg import register_vector +import psycopg + +# generate random data +rows = 1000000 +dimensions = 128 +embeddings = np.random.rand(rows, dimensions) + +# enable extension +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +# create table +conn.execute('DROP TABLE IF EXISTS items') +conn.execute(f'CREATE TABLE items (id bigserial, embedding vector({dimensions}))') + +# load data in batches +cur = conn.cursor() +batches = len(embeddings) // 10000 +print(f'Loading {len(embeddings)} rows over {batches} batches') +for batch in np.array_split(embeddings, batches): + # show progress + print('.', end='', flush=True) + + with cur.copy('COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['vector']) + + for embedding in batch: + copy.write_row([embedding]) + +print('\nSuccess!') + +# create any indexes *after* loading initial data (skipping for this example) +# print('Creating index') +# conn.execute("SET maintenance_work_mem = '8GB'") +# conn.execute("SET max_parallel_maintenance_workers = 7") +# conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops)') + +# update planner statistics for good measure +conn.execute('ANALYZE items') From 9f55a60892fa8e9e14d946fc5c26c4f2027f0c07 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 29 Feb 2024 21:27:13 -0800 Subject: [PATCH 034/424] 
Added comment [skip ci] --- examples/bulk_loading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py index 8fa1928..1fe546b 100644 --- a/examples/bulk_loading.py +++ b/examples/bulk_loading.py @@ -25,6 +25,8 @@ print('.', end='', flush=True) with cur.copy('COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + # use set_types for binary copy + # https://www.psycopg.org/psycopg3/docs/basic/copy.html#binary-copy copy.set_types(['vector']) for embedding in batch: From abe0560564e78e41875150e736007a23326d4fdb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 3 Mar 2024 14:26:39 -0800 Subject: [PATCH 035/424] Improved example [skip ci] --- examples/bulk_loading.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py index 1fe546b..2c2ef44 100644 --- a/examples/bulk_loading.py +++ b/examples/bulk_loading.py @@ -16,11 +16,10 @@ conn.execute('DROP TABLE IF EXISTS items') conn.execute(f'CREATE TABLE items (id bigserial, embedding vector({dimensions}))') -# load data in batches +# load data +print(f'Loading {len(embeddings)} rows') cur = conn.cursor() -batches = len(embeddings) // 10000 -print(f'Loading {len(embeddings)} rows over {batches} batches') -for batch in np.array_split(embeddings, batches): +for batch in np.array_split(embeddings, len(embeddings) // 10000): # show progress print('.', end='', flush=True) From ddffb8068014c5bdcdcbaf2dcbb3107d2dc98241 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 3 Mar 2024 21:18:12 -0800 Subject: [PATCH 036/424] Improved example [skip ci] --- examples/bulk_loading.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py index 2c2ef44..957be4c 100644 --- a/examples/bulk_loading.py +++ b/examples/bulk_loading.py @@ -19,17 +19,21 @@ # load data print(f'Loading {len(embeddings)} rows') cur = 
conn.cursor() -for batch in np.array_split(embeddings, len(embeddings) // 10000): - # show progress - print('.', end='', flush=True) +with cur.copy('COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + # use set_types for binary copy + # https://www.psycopg.org/psycopg3/docs/basic/copy.html#binary-copy + copy.set_types(['vector']) - with cur.copy('COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: - # use set_types for binary copy - # https://www.psycopg.org/psycopg3/docs/basic/copy.html#binary-copy - copy.set_types(['vector']) + for i, embedding in enumerate(embeddings): + # show progress + if i % 10000 == 0: + print('.', end='', flush=True) - for embedding in batch: - copy.write_row([embedding]) + copy.write_row([embedding]) + + # flush data + while conn.pgconn.flush() == 1: + pass print('\nSuccess!') From da00e9b896f6017af51cd80376cc69b2baf2b098 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Mar 2024 22:43:30 -0800 Subject: [PATCH 037/424] Added more insert tests for SQLAlchemy --- tests/test_sqlalchemy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 441b8fd..f595e1b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import Vector import pytest -from sqlalchemy import create_engine, inspect, select, text, MetaData, Table, Column, Index, Integer +from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.orm import declarative_base, mapped_column, Session @@ -232,6 +232,12 @@ def test_literal_binds(self): sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) + def 
test_insert(self): + session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) + + def test_insert_bulk(self): + session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) + @pytest.mark.asyncio async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') From dd84bc1a7f8afa5910b8660c660fb4add3ad4341 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Mar 2024 22:46:33 -0800 Subject: [PATCH 038/424] Added another insert test for SQLAlchemy --- tests/test_sqlalchemy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f595e1b..b79f835 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -238,6 +238,9 @@ def test_insert(self): def test_insert_bulk(self): session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) + def test_insert_text(self): + session.execute(text('INSERT INTO orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + @pytest.mark.asyncio async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') From 1b81759673df531879b5d9081025c0b356acff50 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Mar 2024 15:14:37 -0700 Subject: [PATCH 039/424] Updated example [skip ci] --- examples/bulk_loading.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py index 957be4c..1df8fe7 100644 --- a/examples/bulk_loading.py +++ b/examples/bulk_loading.py @@ -38,10 +38,11 @@ print('\nSuccess!') # create any indexes *after* loading initial data (skipping for this example) -# print('Creating index') -# conn.execute("SET maintenance_work_mem = '8GB'") -# conn.execute("SET max_parallel_maintenance_workers = 7") -# conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops)') +if False: + print('Creating index') + conn.execute("SET 
maintenance_work_mem = '8GB'") + conn.execute('SET max_parallel_maintenance_workers = 7') + conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_cosine_ops)') # update planner statistics for good measure conn.execute('ANALYZE items') From bc213809d99dd2fa46ef9537a80e2567c5f5c202 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 22 Mar 2024 17:08:08 -0700 Subject: [PATCH 040/424] Improved example [skip ci] --- examples/citus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/citus.py b/examples/citus.py index 915c25f..d448204 100644 --- a/examples/citus.py +++ b/examples/citus.py @@ -40,6 +40,9 @@ for i in range(rows): copy.write_row([embeddings[i], categories[i]]) + while conn.pgconn.flush() == 1: + pass + print('Creating index in parallel') conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') From 8bd28ca5ef9737ed179c6c0d2caeb69be3d4d124 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 24 Mar 2024 10:36:39 -0700 Subject: [PATCH 041/424] Improved example [skip ci] --- examples/pytorch_image_search.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/examples/pytorch_image_search.py b/examples/pytorch_image_search.py index e05571c..b4cf131 100644 --- a/examples/pytorch_image_search.py +++ b/examples/pytorch_image_search.py @@ -8,13 +8,11 @@ seed = True - # establish connection conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') register_vector(conn) - # load images transform = torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), @@ -23,7 +21,6 @@ dataset = torchvision.datasets.CIFAR10(root=tempfile.gettempdir(), train=True, download=True, transform=transform) dataloader = torch.utils.data.DataLoader(dataset, batch_size=1000) - # load pretrained model device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') model = torchvision.models.resnet18(weights='DEFAULT') 
@@ -36,7 +33,7 @@ def generate_embeddings(inputs): return model(inputs.to(device)).detach().cpu().numpy() -# generate, save, and index embeddings +# generate and store embeddings if seed: conn.execute('DROP TABLE IF EXISTS images') conn.execute('CREATE TABLE images (id bigserial PRIMARY KEY, embedding vector(512))') @@ -49,23 +46,24 @@ def generate_embeddings(inputs): params = [embedding for embedding in embeddings] conn.execute(sql, params) - -def show_images(dataset_images): - grid = torchvision.utils.make_grid(dataset_images) - img = (grid / 2 + 0.5).permute(1, 2, 0).numpy() - plt.imshow(img) - plt.draw() - plt.waitforbuttonpress(timeout=3) - - # load 5 random unseen images queryset = torchvision.datasets.CIFAR10(root=tempfile.gettempdir(), train=False, download=True, transform=transform) queryloader = torch.utils.data.DataLoader(queryset, batch_size=5, shuffle=True) images = next(iter(queryloader))[0] - # generate and query embeddings +results = [] embeddings = generate_embeddings(images) for image, embedding in zip(images, embeddings): - result = conn.execute('SELECT id FROM images ORDER BY embedding <=> %s LIMIT 15', (embedding,)).fetchall() - show_images([image] + [dataset[row[0] - 1][0] for row in result]) + result = conn.execute('SELECT id FROM images ORDER BY embedding <=> %s LIMIT 5', (embedding,)).fetchall() + nearest_images = [dataset[row[0] - 1][0] for row in result] + results.append([image] + nearest_images) + +# show images +fig, axs = plt.subplots(len(results), len(results[0])) +for i, result in enumerate(results): + for j, image in enumerate(result): + ax = axs[i, j] + ax.imshow((image / 2 + 0.5).permute(1, 2, 0).numpy()) + ax.set_axis_off() +plt.show(block=True) From 751dc1de90aaa713218fcc7b771d331c0fd4e50b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 24 Mar 2024 11:04:46 -0700 Subject: [PATCH 042/424] Added hash image search example [skip ci] --- README.md | 1 + examples/hash_image_search.py | 43 +++++++++++++++++++++++++++++++++++ 
examples/requirements.txt | 1 + 3 files changed, 45 insertions(+) create mode 100644 examples/hash_image_search.py diff --git a/README.md b/README.md index 8aa0411..19acac5 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Or check out some examples: - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing - [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit - [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise - [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM diff --git a/examples/hash_image_search.py b/examples/hash_image_search.py new file mode 100644 index 0000000..d30027d --- /dev/null +++ b/examples/hash_image_search.py @@ -0,0 +1,43 @@ +from datasets import load_dataset +import matplotlib.pyplot as plt +import psycopg +from imagehash import phash + + +def hash_image(img): + return ''.join(['1' if v else '0' for v in phash(img).hash.flatten()]) + + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('DROP TABLE IF EXISTS images') +conn.execute('CREATE TABLE images (id bigserial PRIMARY KEY, hash bit(64))') + +print('Loading dataset') +dataset = load_dataset('mnist') + +print('Generating hashes') +images = [{'hash': hash_image(row['image'])} for row in dataset['train']] + +print('Storing hashes') +cur = conn.cursor() 
+with cur.copy('COPY images (hash) FROM STDIN') as copy: + for image in images: + copy.write_row([image['hash']]) + +print('Querying hashes') +results = [] +for i in range(5): + image = dataset['test'][i]['image'] + result = conn.execute('SELECT id FROM images ORDER BY bit_count(hash # %s) LIMIT 5', (hash_image(image),)).fetchall() + nearest_images = [dataset['train'][row[0] - 1]['image'] for row in result] + results.append([image] + nearest_images) + +print('Showing results (first column is query image)') +fig, axs = plt.subplots(len(results), len(results[0])) +for i, result in enumerate(results): + for j, image in enumerate(result): + ax = axs[i, j] + ax.imshow(image) + ax.set_axis_off() +plt.show(block=True) diff --git a/examples/requirements.txt b/examples/requirements.txt index 4be7ae1..b919260 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,3 +1,4 @@ +imagehash implicit lightfm openai From 78569e0156d3674588b948d24723ad3db3762b89 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 24 Mar 2024 11:10:06 -0700 Subject: [PATCH 043/424] Updated dependencies for examples [skip ci] --- examples/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/requirements.txt b/examples/requirements.txt index b919260..6832670 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,6 +1,8 @@ +datasets imagehash implicit lightfm +matplotlib openai torch torchvision From ad32dc0f00e905c7a6a245ed20bcdbffcfe4e627 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 26 Mar 2024 12:27:47 -0700 Subject: [PATCH 044/424] Fixed warnings on CI and updated pgvector --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c39241c..0724ffe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,7 +9,7 @@ jobs: python: [3.12, 3.8] steps: - uses: actions/checkout@v4 - - uses: 
actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - run: pip install -r requirements.txt @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.6.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.6.2 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From a362227f1ff0a669942f13a4a0836b66df00713a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Apr 2024 11:07:34 -0700 Subject: [PATCH 045/424] Moved code to separate files --- pgvector/psycopg/__init__.py | 57 ++---------------------------------- pgvector/psycopg/vector.py | 55 ++++++++++++++++++++++++++++++++++ pgvector/utils/__init__.py | 50 +------------------------------ pgvector/utils/vector.py | 49 +++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 103 deletions(-) create mode 100644 pgvector/psycopg/vector.py create mode 100644 pgvector/utils/vector.py diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index bd398ec..37a5de2 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,48 +1,13 @@ import psycopg -from psycopg.adapt import Loader, Dumper -from psycopg.pq import Format from psycopg.types import TypeInfo +from .vector import * + +# TODO remove in 0.3.0 from ..utils import from_db, from_db_binary, to_db, to_db_binary __all__ = ['register_vector'] -class VectorDumper(Dumper): - - format = Format.TEXT - - def dump(self, obj): - return to_db(obj).encode("utf8") - - -class VectorBinaryDumper(VectorDumper): - - format = Format.BINARY - - def dump(self, obj): - return to_db_binary(obj) - - -class VectorLoader(Loader): - - format = Format.TEXT - - def load(self, data): - if isinstance(data, memoryview): - data = bytes(data) - return from_db(data.decode("utf8")) - - -class VectorBinaryLoader(VectorLoader): - - format = Format.BINARY - - def load(self, data): - if isinstance(data, memoryview): - data = bytes(data) - return 
from_db_binary(data) - - def register_vector(context): info = TypeInfo.fetch(context, 'vector') register_vector_info(context, info) @@ -51,19 +16,3 @@ def register_vector(context): async def register_vector_async(context): info = await TypeInfo.fetch(context, 'vector') register_vector_info(context, info) - - -def register_vector_info(context, info): - if info is None: - raise psycopg.ProgrammingError('vector type not found in the database') - info.register(context) - - # add oid to anonymous class for set_types - text_dumper = type('', (VectorDumper,), {'oid': info.oid}) - binary_dumper = type('', (VectorBinaryDumper,), {'oid': info.oid}) - - adapters = context.adapters - adapters.register_dumper('numpy.ndarray', text_dumper) - adapters.register_dumper('numpy.ndarray', binary_dumper) - adapters.register_loader(info.oid, VectorLoader) - adapters.register_loader(info.oid, VectorBinaryLoader) diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py new file mode 100644 index 0000000..2289298 --- /dev/null +++ b/pgvector/psycopg/vector.py @@ -0,0 +1,55 @@ +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from ..utils import from_db, from_db_binary, to_db, to_db_binary + + +class VectorDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return to_db(obj).encode("utf8") + + +class VectorBinaryDumper(VectorDumper): + + format = Format.BINARY + + def dump(self, obj): + return to_db_binary(obj) + + +class VectorLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return from_db(data.decode("utf8")) + + +class VectorBinaryLoader(VectorLoader): + + format = Format.BINARY + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return from_db_binary(data) + + +def register_vector_info(context, info): + if info is None: + raise psycopg.ProgrammingError('vector type not found in the database') + info.register(context) + + # add 
oid to anonymous class for set_types + text_dumper = type('', (VectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (VectorBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper('numpy.ndarray', text_dumper) + adapters.register_dumper('numpy.ndarray', binary_dumper) + adapters.register_loader(info.oid, VectorLoader) + adapters.register_loader(info.oid, VectorBinaryLoader) diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 5640b60..e58cbab 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,49 +1 @@ -import numpy as np -from struct import pack, unpack - - -def from_db(value): - # could be ndarray if already cast by lower-level driver - if value is None or isinstance(value, np.ndarray): - return value - - return np.array(value[1:-1].split(','), dtype=np.float32) - - -def from_db_binary(value): - if value is None: - return value - - (dim, unused) = unpack('>HH', value[:4]) - return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) - - -def to_db(value, dim=None): - if value is None: - return value - - if isinstance(value, np.ndarray): - if value.ndim != 1: - raise ValueError('expected ndim to be 1') - - if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): - raise ValueError('dtype must be numeric') - - value = value.tolist() - - if dim is not None and len(value) != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) - - return '[' + ','.join([str(float(v)) for v in value]) + ']' - - -def to_db_binary(value): - if value is None: - return value - - value = np.asarray(value, dtype='>f') - - if value.ndim != 1: - raise ValueError('expected ndim to be 1') - - return pack('>HH', value.shape[0], 0) + value.tobytes() +from .vector import * diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py new file mode 100644 index 0000000..5640b60 --- /dev/null +++ 
b/pgvector/utils/vector.py @@ -0,0 +1,49 @@ +import numpy as np +from struct import pack, unpack + + +def from_db(value): + # could be ndarray if already cast by lower-level driver + if value is None or isinstance(value, np.ndarray): + return value + + return np.array(value[1:-1].split(','), dtype=np.float32) + + +def from_db_binary(value): + if value is None: + return value + + (dim, unused) = unpack('>HH', value[:4]) + return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) + + +def to_db(value, dim=None): + if value is None: + return value + + if isinstance(value, np.ndarray): + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): + raise ValueError('dtype must be numeric') + + value = value.tolist() + + if dim is not None and len(value) != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + + return '[' + ','.join([str(float(v)) for v in value]) + ']' + + +def to_db_binary(value): + if value is None: + return value + + value = np.asarray(value, dtype='>f') + + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + return pack('>HH', value.shape[0], 0) + value.tobytes() From 3dd2067fbcc2255c065ed8cc1b5dc7127739c396 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Apr 2024 12:04:06 -0700 Subject: [PATCH 046/424] Improved code --- pgvector/utils/vector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 5640b60..d831dac 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -1,5 +1,5 @@ import numpy as np -from struct import pack, unpack +from struct import pack, unpack_from def from_db(value): @@ -14,7 +14,7 @@ def from_db_binary(value): if value is None: return value - (dim, unused) = unpack('>HH', value[:4]) + dim, unused = unpack_from('>HH', value) return np.frombuffer(value, 
dtype='>f', count=dim, offset=4).astype(dtype=np.float32) From 70fc5dfdea1c72867996f30077e95767a630b8d9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Apr 2024 12:14:02 -0700 Subject: [PATCH 047/424] Use consistent style [skip ci] --- pgvector/psycopg/vector.py | 4 ++-- pgvector/sqlalchemy/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index 2289298..aa15b92 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -8,7 +8,7 @@ class VectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return to_db(obj).encode("utf8") + return to_db(obj).encode('utf8') class VectorBinaryDumper(VectorDumper): @@ -26,7 +26,7 @@ class VectorLoader(Loader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return from_db(data.decode("utf8")) + return from_db(data.decode('utf8')) class VectorBinaryLoader(VectorLoader): diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 6220082..d788770 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -15,8 +15,8 @@ def __init__(self, dim=None): def get_col_spec(self, **kw): if self.dim is None: - return "VECTOR" - return "VECTOR(%d)" % self.dim + return 'VECTOR' + return 'VECTOR(%d)' % self.dim def bind_processor(self, dialect): def process(value): From 5ba96ffa9f4ef751e8385aa7684cd2764f5434e7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Apr 2024 12:26:59 -0700 Subject: [PATCH 048/424] Added test for bit [skip ci] --- tests/test_psycopg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 926bf45..a974808 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -71,6 +71,10 @@ def test_binary_copy_set_types(self): copy.set_types(['int8', 'vector']) copy.write_row([1, embedding]) + def test_bit(self): + res = conn.execute('SELECT %s::bit(3)', 
('101',)).fetchone()[0] + assert res == '101' + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From 3c51f396b3dd8390beae372e4a982dda2c01ae9a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 1 May 2024 16:13:29 -0700 Subject: [PATCH 049/424] Added support for halfvec and sparsevec types to Psycopg 3 --- .github/workflows/build.yml | 2 +- CHANGELOG.md | 4 +++ pgvector/psycopg/__init__.py | 22 +++++++++++++- pgvector/psycopg/halfvec.py | 57 +++++++++++++++++++++++++++++++++++ pgvector/psycopg/sparsevec.py | 57 +++++++++++++++++++++++++++++++++++ pgvector/utils/__init__.py | 2 ++ pgvector/utils/halfvec.py | 29 ++++++++++++++++++ pgvector/utils/sparsevec.py | 46 ++++++++++++++++++++++++++++ tests/test_psycopg.py | 43 ++++++++++++++++++++++++-- 9 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 pgvector/psycopg/halfvec.py create mode 100644 pgvector/psycopg/sparsevec.py create mode 100644 pgvector/utils/halfvec.py create mode 100644 pgvector/utils/sparsevec.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0724ffe..f8bcaa3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.6.2 https://github.com/pgvector/pgvector.git + git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bfc2dc..cf51599 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.6 (unreleased) + +- Added support for `halfvec` and `sparsevec` types to Psycopg 3 + ## 0.2.5 (2024-02-07) - Added literal binds support for SQLAlchemy diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 37a5de2..3228afc 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,8 +1,12 @@ import psycopg from psycopg.types 
import TypeInfo -from .vector import * +from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info +from .vector import register_vector_info +from ..utils import HalfVec, SparseVec # TODO remove in 0.3.0 +from .vector import * from ..utils import from_db, from_db_binary, to_db, to_db_binary __all__ = ['register_vector'] @@ -12,7 +16,23 @@ def register_vector(context): info = TypeInfo.fetch(context, 'vector') register_vector_info(context, info) + info = TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) + async def register_vector_async(context): info = await TypeInfo.fetch(context, 'vector') register_vector_info(context, info) + + info = await TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = await TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py new file mode 100644 index 0000000..3a159d6 --- /dev/null +++ b/pgvector/psycopg/halfvec.py @@ -0,0 +1,57 @@ +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from ..utils import HalfVec + + +class HalfVecDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return obj.to_db().encode('utf8') + + +class HalfVecBinaryDumper(HalfVecDumper): + + format = Format.BINARY + + def dump(self, obj): + return obj.to_db_binary() + + +class HalfVecLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if data is None: + return None + if isinstance(data, memoryview): + data = bytes(data) + return HalfVec.from_db(data.decode('utf8')) + + +class HalfVecBinaryLoader(HalfVecLoader): + + format = Format.BINARY + + def load(self, data): + if data is None: + return None + if isinstance(data, memoryview): + data = bytes(data) + 
return HalfVec.from_db_binary(data) + + +def register_halfvec_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (HalfVecDumper,), {'oid': info.oid}) + binary_dumper = type('', (HalfVecBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(HalfVec, text_dumper) + adapters.register_dumper(HalfVec, binary_dumper) + adapters.register_loader(info.oid, HalfVecLoader) + adapters.register_loader(info.oid, HalfVecBinaryLoader) diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py new file mode 100644 index 0000000..18f6e1a --- /dev/null +++ b/pgvector/psycopg/sparsevec.py @@ -0,0 +1,57 @@ +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from ..utils import SparseVec + + +class SparseVecDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return obj.to_db().encode('utf8') + + +class SparseVecBinaryDumper(SparseVecDumper): + + format = Format.BINARY + + def dump(self, obj): + return obj.to_db_binary() + + +class SparseVecLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if data is None: + return None + if isinstance(data, memoryview): + data = bytes(data) + return SparseVec.from_db(data.decode('utf8')) + + +class SparseVecBinaryLoader(SparseVecLoader): + + format = Format.BINARY + + def load(self, data): + if data is None: + return None + if isinstance(data, memoryview): + data = bytes(data) + return SparseVec.from_db_binary(data) + + +def register_sparsevec_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (SparseVecDumper,), {'oid': info.oid}) + binary_dumper = type('', (SparseVecBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(SparseVec, text_dumper) + adapters.register_dumper(SparseVec, binary_dumper) + adapters.register_loader(info.oid, SparseVecLoader) + 
adapters.register_loader(info.oid, SparseVecBinaryLoader) diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index e58cbab..897862b 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1 +1,3 @@ +from .halfvec import * +from .sparsevec import * from .vector import * diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py new file mode 100644 index 0000000..262d3ad --- /dev/null +++ b/pgvector/utils/halfvec.py @@ -0,0 +1,29 @@ +from struct import pack, unpack_from + + +class HalfVec: + def __init__(self, value): + # TODO support np.array + if not isinstance(value, (list, tuple)): + raise ValueError('expected list or tuple') + + self.value = value + + def to_list(self): + return list(self.value) + + def to_db(self): + return '[' + ','.join([str(float(v)) for v in self.value]) + ']' + + def to_db_binary(self): + return pack(f'>HH{len(self.value)}e', len(self.value), 0, *self.value) + + def from_db(value): + return HalfVec([float(v) for v in value[1:-1].split(',')]) + + def from_db_binary(value): + dim, unused = unpack_from('>HH', value) + return HalfVec(unpack_from(f'>{dim}e', value, 4)) + + def __repr__(self): + return f'HalfVec({self.value})' diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py new file mode 100644 index 0000000..72ec0e4 --- /dev/null +++ b/pgvector/utils/sparsevec.py @@ -0,0 +1,46 @@ +from struct import pack, unpack_from + + +class SparseVec: + def __init__(self, dim, indices, values): + self.dim = dim + self.indices = indices + self.values = values + + def from_dense(value): + dim = len(value) + indices = [i for i, v in enumerate(value) if v != 0] + values = [value[i] for i in indices] + return SparseVec(dim, indices, values) + + def to_dense(self): + vec = [0] * self.dim + for i, v in zip(self.indices, self.values): + vec[i] = v + return vec + + def to_db(self): + return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(self.indices, self.values)]) + '}/' + str(self.dim) + + def 
to_db_binary(self): + nnz = len(self.indices) + return pack(f'>iii{nnz}i{nnz}f', self.dim, nnz, 0, *self.indices, *self.values) + + def from_db(value): + elements, dim = value.split('/') + indices = [] + values = [] + for e in elements[1:-1].split(','): + i, v = e.split(':') + indices.append(int(i) - 1) + values.append(float(v)) + return SparseVec(int(dim), indices, values) + + def from_db_binary(value): + dim, nnz, unused = unpack_from('>iii', value) + indices = unpack_from(f'>{nnz}i', value, 12) + values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) + return SparseVec(int(dim), indices, values) + + def __repr__(self): + return f'SparseVec({self.dim}, {self.indices}, {self.values})' diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index a974808..9a7f611 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg import register_vector, register_vector_async +from pgvector.psycopg import register_vector, register_vector_async, HalfVec, SparseVec import psycopg import pytest @@ -27,7 +27,7 @@ def test_works(self): def test_binary_format(self): embedding = np.array([1.5, 2, 3]) - res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] + res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] assert np.array_equal(res, embedding) def test_text_format(self): @@ -71,6 +71,45 @@ def test_binary_copy_set_types(self): copy.set_types(['int8', 'vector']) copy.write_row([1, embedding]) + def test_halfvec(self): + conn.execute('DROP TABLE IF EXISTS half_items') + conn.execute('CREATE TABLE half_items (id bigserial PRIMARY KEY, embedding halfvec(3))') + + embedding = HalfVec([1.5, 2, 3]) + conn.execute('INSERT INTO half_items (embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT * FROM half_items ORDER BY id').fetchall() + + def test_halfvec_binary_format(self): + embedding = HalfVec([1.5, 2, 3]) + res = conn.execute('SELECT %b::halfvec', (embedding,), 
binary=True).fetchone()[0] + assert res.to_list() == [1.5, 2, 3] + + def test_halfvec_text_format(self): + embedding = HalfVec([1.5, 2, 3]) + res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] + assert res.to_list() == [1.5, 2, 3] + + def test_sparsevec(self): + conn.execute('DROP TABLE IF EXISTS sparse_items') + conn.execute('CREATE TABLE sparse_items (id bigserial PRIMARY KEY, embedding sparsevec(6))') + + embedding = SparseVec.from_dense([0, 1.5, 0, 2, 0, 3]) + conn.execute('INSERT INTO sparse_items (embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT * FROM sparse_items ORDER BY id').fetchall() + assert res[0][1].to_dense() == [0, 1.5, 0, 2, 0, 3] + + def test_sparsevec_binary_format(self): + embedding = SparseVec.from_dense([1.5, 2, 3]) + res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res.to_dense() == [1.5, 2, 3] + + def test_sparsevec_text_format(self): + embedding = SparseVec.from_dense([1.5, 2, 3]) + res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res.to_dense() == [1.5, 2, 3] + def test_bit(self): res = conn.execute('SELECT %s::bit(3)', ('101',)).fetchone()[0] assert res == '101' From 62c69fca1b63faa7500350cc8044bdce78cca96c Mon Sep 17 00:00:00 2001 From: Domenico Date: Thu, 16 May 2024 17:54:41 +0200 Subject: [PATCH 050/424] L1 distance support for sqlalchemy and sqlmodel (#69) --- pgvector/sqlalchemy/__init__.py | 3 +++ tests/test_sqlalchemy.py | 12 ++++++++++++ tests/test_sqlmodel.py | 6 ++++++ 3 files changed, 21 insertions(+) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index d788770..9bd6ccc 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -39,6 +39,9 @@ class comparator_factory(UserDefinedType.Comparator): def l2_distance(self, other): return self.op('<->', return_type=Float)(other) + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + 
def max_inner_product(self, other): return self.op('<#>', return_type=Float)(other) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index b79f835..e5ce4ce 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -119,6 +119,18 @@ def test_l2_distance_orm(self): items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 2])).all() + assert [v.id for v in items] == [3, 1, 2] + + def test_l1_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 2]))) + assert [v.id for v in items] == [3, 1, 2] + def test_max_inner_product(self): create_items() with Session(engine) as session: diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 71fa801..fbd4410 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -79,6 +79,12 @@ def test_l2_distance(self): items = session.exec(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_max_inner_product(self): create_items() with Session(engine) as session: From 5a530ee5478adb1b63e1d9edbaa817f87a22a067 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 11:57:22 -0400 Subject: [PATCH 051/424] Moved l1_distance function and tests to be consistent with other libraries --- CHANGELOG.md | 1 + pgvector/sqlalchemy/__init__.py | 6 +++--- tests/test_sqlalchemy.py | 24 ++++++++++++------------ tests/test_sqlmodel.py | 12 ++++++------ 4 files changed, 22 insertions(+), 21 deletions(-) diff 
--git a/CHANGELOG.md b/CHANGELOG.md index cf51599..2fe2ce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Psycopg 3 +- Added `l1_distance` for SQLAlchemy and SQLModel ## 0.2.5 (2024-02-07) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 9bd6ccc..e035e79 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -39,15 +39,15 @@ class comparator_factory(UserDefinedType.Comparator): def l2_distance(self, other): return self.op('<->', return_type=Float)(other) - def l1_distance(self, other): - return self.op('<+>', return_type=Float)(other) - def max_inner_product(self, other): return self.op('<#>', return_type=Float)(other) def cosine_distance(self, other): return self.op('<=>', return_type=Float)(other) + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + # for reflection ischema_names['vector'] = Vector diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index e5ce4ce..5db608f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -119,18 +119,6 @@ def test_l2_distance_orm(self): items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_l1_distance(self): - create_items() - with Session(engine) as session: - items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 2])).all() - assert [v.id for v in items] == [3, 1, 2] - - def test_l1_distance_orm(self): - create_items() - with Session(engine) as session: - items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 2]))) - assert [v.id for v in items] == [3, 1, 2] - def test_max_inner_product(self): create_items() with Session(engine) as session: @@ -155,6 +143,18 @@ def test_cosine_distance_orm(self): items = session.scalars(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) 
assert [v.id for v in items] == [1, 2, 3] + def test_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_l1_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_filter(self): create_items() with Session(engine) as session: diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index fbd4410..8d349ea 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -79,12 +79,6 @@ def test_l2_distance(self): items = session.exec(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_l1_distance(self): - create_items() - with Session(engine) as session: - items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) - assert [v.id for v in items] == [1, 3, 2] - def test_max_inner_product(self): create_items() with Session(engine) as session: @@ -97,6 +91,12 @@ def test_cosine_distance(self): items = session.exec(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] + def test_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_filter(self): create_items() with Session(engine) as session: From 2a0eeb47fe9b818495f091001f2b412d66f31d23 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 12:00:39 -0400 Subject: [PATCH 052/424] Added L1Distance for Django --- CHANGELOG.md | 1 + pgvector/django/__init__.py | 7 ++++++- tests/test_django.py | 9 ++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
2fe2ce1..b9087a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Psycopg 3 +- Added `L1Distance` for Django - Added `l1_distance` for SQLAlchemy and SQLModel ## 0.2.5 (2024-02-07) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index ab250e6..e4c8e21 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -5,7 +5,7 @@ from .forms import VectorFormField from ..utils import from_db, to_db -__all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance'] +__all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] class VectorExtension(CreateExtension): @@ -128,3 +128,8 @@ class MaxInnerProduct(DistanceBase): class CosineDistance(DistanceBase): function = '' arg_joiner = ' <=> ' + + +class L1Distance(DistanceBase): + function = '' + arg_joiner = ' <+> ' diff --git a/tests/test_django.py b/tests/test_django.py index b68b5f2..b6274cd 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance +from pgvector.django import VectorExtension, VectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance from unittest import mock settings.configure( @@ -131,6 +131,13 @@ def test_cosine_distance(self): assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + def test_l1_distance(self): + create_items() + distance = L1Distance('embedding', [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def 
test_filter(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) From f489f56414da1b505175fbfab68ca9e94f56860b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 12:02:48 -0400 Subject: [PATCH 053/424] Added l1_distance for Peewee --- CHANGELOG.md | 2 +- pgvector/peewee/__init__.py | 3 +++ tests/test_peewee.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9087a1..b1108c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added `L1Distance` for Django -- Added `l1_distance` for SQLAlchemy and SQLModel +- Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee ## 0.2.5 (2024-02-07) diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 148f8b3..756959d 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -29,3 +29,6 @@ def max_inner_product(self, vector): def cosine_distance(self, vector): return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 38acb7c..becbb58 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -64,6 +64,13 @@ def test_cosine_distance(self): assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + def test_l1_distance(self): + create_items() + distance = Item.embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) From 7add3381143806bbdb7fa78d8ec886112a89ceab Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 12:21:01 -0400 Subject: [PATCH 054/424] Updated license year [skip ci] 
--- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index b3134ac..d205f4e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2021-2023 Andrew Kane +Copyright (c) 2021-2024 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 03e8e3f7ec1de13d50590e7f0c5c5c88c1696658 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 12:22:05 -0400 Subject: [PATCH 055/424] Updated changelog [skip ci] --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1108c2..1d2ae09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,8 +37,8 @@ ## 0.1.7 (2023-05-11) -- Added `register_vector_async` for psycopg3 -- Fixed `set_types` for psycopg3 +- Added `register_vector_async` for Psycopg 3 +- Fixed `set_types` for Psycopg 3 ## 0.1.6 (2022-05-22) @@ -51,12 +51,12 @@ ## 0.1.4 (2021-10-12) -- Updated psycopg3 integration for 3.0 release (no longer experimental) +- Updated Psycopg 3 integration for 3.0 release (no longer experimental) ## 0.1.3 (2021-06-22) - Added support for asyncpg -- Added experimental support for psycopg3 +- Added experimental support for Psycopg 3 ## 0.1.2 (2021-06-13) From 03f7ea82df9f2c195e08754b6e7b3df8ad1460a3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 16:11:51 -0400 Subject: [PATCH 056/424] Added support for halfvec and sparsevec types to asyncpg --- CHANGELOG.md | 1 + pgvector/asyncpg/__init__.py | 17 +++++++++++- tests/test_asyncpg.py | 52 ++++++++++++++++++++++++++++++++++-- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d2ae09..591a3d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Psycopg 3 +- Added 
support for `halfvec` and `sparsevec` types to asyncpg - Added `L1Distance` for Django - Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 5797327..9603ac5 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,4 +1,4 @@ -from ..utils import from_db, from_db_binary, to_db, to_db_binary +from ..utils import from_db, from_db_binary, to_db, to_db_binary, HalfVec, SparseVec __all__ = ['register_vector'] @@ -10,3 +10,18 @@ async def register_vector(conn): decoder=from_db_binary, format='binary' ) + + await conn.set_type_codec( + 'halfvec', + encoder=lambda v: HalfVec(v).to_db_binary(), + decoder=HalfVec.from_db_binary, + format='binary' + ) + + await conn.set_type_codec( + 'sparsevec', + # TODO fix + encoder=lambda v: isinstance(v, SparseVec) and v.to_db_binary(), + decoder=SparseVec.from_db_binary, + format='binary' + ) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index f06ac3d..28cbacd 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,13 +1,13 @@ import asyncio import asyncpg import numpy as np -from pgvector.asyncpg import register_vector +from pgvector.asyncpg import register_vector, SparseVec import pytest class TestAsyncpg: @pytest.mark.asyncio - async def test_works(self): + async def test_vector(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') await conn.execute('DROP TABLE IF EXISTS items') @@ -31,6 +31,54 @@ async def test_works(self): await conn.close() + @pytest.mark.asyncio + async def test_halfvec(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding halfvec(3))') + + await register_vector(conn) + + embedding = [1.5, 2, 3] + 
await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + + res = await conn.fetch("SELECT * FROM items ORDER BY id") + assert res[0]['id'] == 1 + assert res[1]['id'] == 2 + assert res[0]['embedding'].to_list() == [1.5, 2, 3] + assert res[1]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '[1.5,2,3]' + + await conn.close() + + @pytest.mark.asyncio + async def test_sparsevec(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding sparsevec(3))') + + await register_vector(conn) + + embedding = SparseVec.from_dense([1.5, 2, 3]) + await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + + res = await conn.fetch("SELECT * FROM items ORDER BY id") + assert res[0]['id'] == 1 + assert res[1]['id'] == 2 + assert res[0]['embedding'].to_dense() == [1.5, 2, 3] + assert res[1]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '{1:1.5,2:2,3:3}/3' + + await conn.close() + @pytest.mark.asyncio async def test_pool(self): async def init(conn): From 8ef3e0dd2a7af8a76c7c35b2e06d2739335569dc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 16:16:06 -0400 Subject: [PATCH 057/424] Moved VectorField to separate file [skip ci] --- pgvector/peewee/__init__.py | 34 +--------------------------------- pgvector/peewee/vector.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) create mode 100644 pgvector/peewee/vector.py diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 756959d..ec4b615 
100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,34 +1,2 @@ -from peewee import Expression, Field, Value from ..utils import from_db, to_db - - -class VectorField(Field): - field_type = 'vector' - - def __init__(self, dimensions=None, *args, **kwargs): - self.dimensions = dimensions - super(VectorField, self).__init__(*args, **kwargs) - - def get_modifiers(self): - return self.dimensions and [self.dimensions] or None - - def db_value(self, value): - return to_db(value) - - def python_value(self, value): - return from_db(value) - - def _distance(self, op, vector): - return Expression(lhs=self, op=op, rhs=self.to_value(vector)) - - def l2_distance(self, vector): - return self._distance('<->', vector) - - def max_inner_product(self, vector): - return self._distance('<#>', vector) - - def cosine_distance(self, vector): - return self._distance('<=>', vector) - - def l1_distance(self, vector): - return self._distance('<+>', vector) +from .vector import VectorField diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py new file mode 100644 index 0000000..756959d --- /dev/null +++ b/pgvector/peewee/vector.py @@ -0,0 +1,34 @@ +from peewee import Expression, Field, Value +from ..utils import from_db, to_db + + +class VectorField(Field): + field_type = 'vector' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(VectorField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + return to_db(value) + + def python_value(self, value): + return from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def 
l1_distance(self, vector): + return self._distance('<+>', vector) From 3ee6b94571189e5721170c7d02fba1cbece0ab33 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 16:21:40 -0400 Subject: [PATCH 058/424] Added support for halfvec type to Peewee [skip ci] --- CHANGELOG.md | 1 + pgvector/peewee/__init__.py | 1 + pgvector/peewee/halfvec.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_peewee.py | 14 +++++++++++--- 4 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 pgvector/peewee/halfvec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 591a3d1..c9826d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to asyncpg +- Added support for `halfvec` type to Peewee - Added `L1Distance` for Django - Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index ec4b615..c79cbf9 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,2 +1,3 @@ from ..utils import from_db, to_db +from .halfvec import HalfvecField from .vector import VectorField diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py new file mode 100644 index 0000000..8d8d176 --- /dev/null +++ b/pgvector/peewee/halfvec.py @@ -0,0 +1,35 @@ +from peewee import Expression, Field, Value +# TODO use halfvec functions +from ..utils import from_db, to_db + + +class HalfvecField(Field): + field_type = 'halfvec' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(HalfvecField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + return to_db(value) + + def python_value(self, value): + return from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + 
def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index becbb58..970ee41 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField +from pgvector.peewee import VectorField, HalfvecField db = PostgresqlDatabase('pgvector_python_test') @@ -12,7 +12,8 @@ class Meta: class Item(BaseModel): - embedding = VectorField(dimensions=3) + embedding = VectorField(dimensions=3, null=True) + half_embedding = HalfvecField(dimensions=3, null=True) Item.add_index('embedding vector_l2_ops', using='hnsw') @@ -30,7 +31,7 @@ def create_items(): [1, 1, 2] ] for i, v in enumerate(vectors): - Item.create(id=i + 1, embedding=v) + Item.create(id=i + 1, embedding=v, half_embedding=v) class TestPeewee: @@ -71,6 +72,13 @@ def test_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_halfvec_l2_distance(self): + create_items() + distance = Item.half_embedding.l2_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) From bd9d7d68d6b6a81fcbc10b2b7804fa775057eee0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 16:28:42 -0400 Subject: [PATCH 059/424] Added support for sparsevec type to Peewee [skip ci] --- CHANGELOG.md | 2 +- pgvector/peewee/__init__.py | 3 ++- pgvector/peewee/halfvec.py | 9 +++++---- 
pgvector/peewee/sparsevec.py | 37 ++++++++++++++++++++++++++++++++++++ tests/test_peewee.py | 12 ++++++++++-- 5 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 pgvector/peewee/sparsevec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c9826d3..c48f5ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to asyncpg -- Added support for `halfvec` type to Peewee +- Added support for `halfvec` and `sparsevec` types to Peewee - Added `L1Distance` for Django - Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index c79cbf9..106170f 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,3 +1,4 @@ -from ..utils import from_db, to_db +from ..utils import from_db, to_db, SparseVec from .halfvec import HalfvecField +from .sparsevec import SparsevecField from .vector import VectorField diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index 8d8d176..896bffd 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -1,6 +1,5 @@ from peewee import Expression, Field, Value -# TODO use halfvec functions -from ..utils import from_db, to_db +from ..utils import HalfVec class HalfvecField(Field): @@ -14,10 +13,12 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return to_db(value) + return HalfVec(value).to_db() def python_value(self, value): - return from_db(value) + if value is None: + return value + return HalfVec.from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py new file mode 100644 index 0000000..e171e63 --- /dev/null +++ b/pgvector/peewee/sparsevec.py @@ -0,0 +1,37 @@ +from peewee import Expression, Field, 
Value +from ..utils import SparseVec + + +class SparsevecField(Field): + field_type = 'sparsevec' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(SparsevecField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + # TODO improve + return value.to_db() + + def python_value(self, value): + if value is None: + return value + return SparseVec.from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 970ee41..e429f8c 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfvecField +from pgvector.peewee import VectorField, HalfvecField, SparsevecField, SparseVec db = PostgresqlDatabase('pgvector_python_test') @@ -14,6 +14,7 @@ class Meta: class Item(BaseModel): embedding = VectorField(dimensions=3, null=True) half_embedding = HalfvecField(dimensions=3, null=True) + sparse_embedding = SparsevecField(dimensions=3, null=True) Item.add_index('embedding vector_l2_ops', using='hnsw') @@ -31,7 +32,7 @@ def create_items(): [1, 1, 2] ] for i, v in enumerate(vectors): - Item.create(id=i + 1, embedding=v, half_embedding=v) + Item.create(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v)) class TestPeewee: @@ -79,6 +80,13 @@ def test_halfvec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] 
== [0, 1, sqrt(3)] + def test_sparsevec_l2_distance(self): + create_items() + distance = Item.sparse_embedding.l2_distance(SparseVec.from_dense([1, 1, 1])) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) From afb416f947a21c08738857da68dfc7cca724926a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 16:31:04 -0400 Subject: [PATCH 060/424] Moved Vector to separate file for SQLAlchemy [skip ci] --- pgvector/sqlalchemy/__init__.py | 52 +-------------------------------- pgvector/sqlalchemy/vector.py | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 51 deletions(-) create mode 100644 pgvector/sqlalchemy/vector.py diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index e035e79..65193c6 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,53 +1,3 @@ -from sqlalchemy.dialects.postgresql.base import ischema_names -from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import from_db, to_db +from .vector import Vector __all__ = ['Vector'] - - -class Vector(UserDefinedType): - cache_ok = True - _string = String() - - def __init__(self, dim=None): - super(UserDefinedType, self).__init__() - self.dim = dim - - def get_col_spec(self, **kw): - if self.dim is None: - return 'VECTOR' - return 'VECTOR(%d)' % self.dim - - def bind_processor(self, dialect): - def process(value): - return to_db(value, self.dim) - return process - - def literal_processor(self, dialect): - string_literal_processor = self._string._cached_literal_processor(dialect) - - def process(value): - return string_literal_processor(to_db(value, self.dim)) - return process - - def result_processor(self, dialect, coltype): - def process(value): - return 
from_db(value) - return process - - class comparator_factory(UserDefinedType.Comparator): - def l2_distance(self, other): - return self.op('<->', return_type=Float)(other) - - def max_inner_product(self, other): - return self.op('<#>', return_type=Float)(other) - - def cosine_distance(self, other): - return self.op('<=>', return_type=Float)(other) - - def l1_distance(self, other): - return self.op('<+>', return_type=Float)(other) - - -# for reflection -ischema_names['vector'] = Vector diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py new file mode 100644 index 0000000..01f1f24 --- /dev/null +++ b/pgvector/sqlalchemy/vector.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from ..utils import from_db, to_db + + +class Vector(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'VECTOR' + return 'VECTOR(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + + +# for reflection 
+ischema_names['vector'] = Vector From 97e28a58b97e5fd347cd4e64a260792b634f239f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:27:21 -0400 Subject: [PATCH 061/424] Improved code --- pgvector/asyncpg/__init__.py | 5 ++--- pgvector/peewee/halfvec.py | 4 +--- pgvector/peewee/sparsevec.py | 5 +---- pgvector/psycopg/halfvec.py | 8 ++------ pgvector/psycopg/sparsevec.py | 8 ++------ pgvector/utils/halfvec.py | 22 +++++++++++++++++----- pgvector/utils/sparsevec.py | 18 +++++++++++++----- 7 files changed, 38 insertions(+), 32 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 9603ac5..c8fc6b2 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -13,15 +13,14 @@ async def register_vector(conn): await conn.set_type_codec( 'halfvec', - encoder=lambda v: HalfVec(v).to_db_binary(), + encoder=HalfVec.to_db_binary, decoder=HalfVec.from_db_binary, format='binary' ) await conn.set_type_codec( 'sparsevec', - # TODO fix - encoder=lambda v: isinstance(v, SparseVec) and v.to_db_binary(), + encoder=SparseVec.to_db_binary, decoder=SparseVec.from_db_binary, format='binary' ) diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index 896bffd..60edd83 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -13,11 +13,9 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return HalfVec(value).to_db() + return HalfVec.to_db(value) def python_value(self, value): - if value is None: - return value return HalfVec.from_db(value) def _distance(self, op, vector): diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index e171e63..c44d4fe 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -13,12 +13,9 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - # TODO improve - return value.to_db() + return 
SparseVec.to_db(value) def python_value(self, value): - if value is None: - return value return SparseVec.from_db(value) def _distance(self, op, vector): diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py index 3a159d6..023586a 100644 --- a/pgvector/psycopg/halfvec.py +++ b/pgvector/psycopg/halfvec.py @@ -8,7 +8,7 @@ class HalfVecDumper(Dumper): format = Format.TEXT def dump(self, obj): - return obj.to_db().encode('utf8') + return HalfVec.to_db(obj).encode('utf8') class HalfVecBinaryDumper(HalfVecDumper): @@ -16,7 +16,7 @@ class HalfVecBinaryDumper(HalfVecDumper): format = Format.BINARY def dump(self, obj): - return obj.to_db_binary() + return HalfVec.to_db_binary(obj) class HalfVecLoader(Loader): @@ -24,8 +24,6 @@ class HalfVecLoader(Loader): format = Format.TEXT def load(self, data): - if data is None: - return None if isinstance(data, memoryview): data = bytes(data) return HalfVec.from_db(data.decode('utf8')) @@ -36,8 +34,6 @@ class HalfVecBinaryLoader(HalfVecLoader): format = Format.BINARY def load(self, data): - if data is None: - return None if isinstance(data, memoryview): data = bytes(data) return HalfVec.from_db_binary(data) diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py index 18f6e1a..a5dec21 100644 --- a/pgvector/psycopg/sparsevec.py +++ b/pgvector/psycopg/sparsevec.py @@ -8,7 +8,7 @@ class SparseVecDumper(Dumper): format = Format.TEXT def dump(self, obj): - return obj.to_db().encode('utf8') + return SparseVec.to_db(obj).encode('utf8') class SparseVecBinaryDumper(SparseVecDumper): @@ -16,7 +16,7 @@ class SparseVecBinaryDumper(SparseVecDumper): format = Format.BINARY def dump(self, obj): - return obj.to_db_binary() + return SparseVec.to_db_binary(obj) class SparseVecLoader(Loader): @@ -24,8 +24,6 @@ class SparseVecLoader(Loader): format = Format.TEXT def load(self, data): - if data is None: - return None if isinstance(data, memoryview): data = bytes(data) return SparseVec.from_db(data.decode('utf8')) @@ 
-36,8 +34,6 @@ class SparseVecBinaryLoader(SparseVecLoader): format = Format.BINARY def load(self, data): - if data is None: - return None if isinstance(data, memoryview): data = bytes(data) return SparseVec.from_db_binary(data) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 262d3ad..36b1e4c 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -12,16 +12,28 @@ def __init__(self, value): def to_list(self): return list(self.value) - def to_db(self): - return '[' + ','.join([str(float(v)) for v in self.value]) + ']' - - def to_db_binary(self): - return pack(f'>HH{len(self.value)}e', len(self.value), 0, *self.value) + def to_db(value): + if value is None: + return value + if isinstance(value, HalfVec): + value = value.value + return '[' + ','.join([str(float(v)) for v in value]) + ']' + + def to_db_binary(value): + if value is None: + return value + if isinstance(value, HalfVec): + value = value.value + return pack(f'>HH{len(value)}e', len(value), 0, *value) def from_db(value): + if value is None: + return value return HalfVec([float(v) for v in value[1:-1].split(',')]) def from_db_binary(value): + if value is None: + return value dim, unused = unpack_from('>HH', value) return HalfVec(unpack_from(f'>{dim}e', value, 4)) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 72ec0e4..7e4cb6b 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -19,14 +19,20 @@ def to_dense(self): vec[i] = v return vec - def to_db(self): - return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(self.indices, self.values)]) + '}/' + str(self.dim) + def to_db(value): + if value is None: + return value + return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value.indices, value.values)]) + '}/' + str(value.dim) - def to_db_binary(self): - nnz = len(self.indices) - return pack(f'>iii{nnz}i{nnz}f', self.dim, nnz, 0, *self.indices, *self.values) + def to_db_binary(value): + if value is None: + return 
value + nnz = len(value.indices) + return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) def from_db(value): + if value is None: + return value elements, dim = value.split('/') indices = [] values = [] @@ -37,6 +43,8 @@ def from_db(value): return SparseVec(int(dim), indices, values) def from_db_binary(value): + if value is None: + return value dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) From 4f07066ad0e999dcf45019f6857fcc733d89c592 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:31:57 -0400 Subject: [PATCH 062/424] Moved VectorAdapter to separate file for Psycopg 2 [skip ci] --- pgvector/psycopg2/__init__.py | 19 ++----------------- pgvector/psycopg2/vector.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 17 deletions(-) create mode 100644 pgvector/psycopg2/vector.py diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 94cc081..b51be42 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,23 +1,10 @@ -import numpy as np import psycopg2 -from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from .vector import register_vector_info from ..utils import from_db, to_db __all__ = ['register_vector'] -class VectorAdapter(object): - def __init__(self, vector): - self._vector = vector - - def getquoted(self): - return adapt(to_db(self._vector)).getquoted() - - -def cast_vector(value, cur): - return from_db(value) - - def register_vector(conn_or_curs=None): cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs @@ -27,6 +14,4 @@ def register_vector(conn_or_curs=None): except psycopg2.errors.UndefinedObject: raise psycopg2.ProgrammingError('vector type not found in the database') - vector = new_type((oid,), 'VECTOR', cast_vector) - register_type(vector) - register_adapter(np.ndarray, VectorAdapter) + 
register_vector_info(oid) diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py new file mode 100644 index 0000000..f314948 --- /dev/null +++ b/pgvector/psycopg2/vector.py @@ -0,0 +1,21 @@ +import numpy as np +from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from ..utils import from_db, to_db + + +class VectorAdapter(object): + def __init__(self, vector): + self._vector = vector + + def getquoted(self): + return adapt(to_db(self._vector)).getquoted() + + +def cast_vector(value, cur): + return from_db(value) + + +def register_vector_info(oid): + vector = new_type((oid,), 'VECTOR', cast_vector) + register_type(vector) + register_adapter(np.ndarray, VectorAdapter) From 73ccb192bfb443d51674c82c1083e03a1a2c06a3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:40:55 -0400 Subject: [PATCH 063/424] Added support for halfvec type to Psycopg 2 [skip ci] --- CHANGELOG.md | 1 + pgvector/psycopg2/__init__.py | 9 +++++++-- pgvector/psycopg2/halfvec.py | 20 ++++++++++++++++++++ pgvector/utils/halfvec.py | 4 ++-- pgvector/utils/sparsevec.py | 4 ++-- tests/test_psycopg2.py | 21 +++++++++++++++------ 6 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 pgvector/psycopg2/halfvec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c48f5ad..2c93317 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Psycopg 3 +- Added support for `halfvec` type to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg - Added support for `halfvec` and `sparsevec` types to Peewee - Added `L1Distance` for Django diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index b51be42..bde04e5 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,4 +1,5 @@ import psycopg2 +from .halfvec import register_halfvec_info from .vector import register_vector_info from ..utils 
import from_db, to_db @@ -10,8 +11,12 @@ def register_vector(conn_or_curs=None): try: cur.execute('SELECT NULL::vector') - oid = cur.description[0][1] + register_vector_info(cur.description[0][1]) except psycopg2.errors.UndefinedObject: raise psycopg2.ProgrammingError('vector type not found in the database') - register_vector_info(oid) + try: + cur.execute('SELECT NULL::halfvec') + register_halfvec_info(cur.description[0][1]) + except psycopg2.errors.UndefinedObject: + pass diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py new file mode 100644 index 0000000..dc87ba1 --- /dev/null +++ b/pgvector/psycopg2/halfvec.py @@ -0,0 +1,20 @@ +from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from ..utils import HalfVec + + +class HalfvecAdapter(object): + def __init__(self, halfvec): + self._halfvec = halfvec + + def getquoted(self): + return adapt(HalfVec.to_db(self._halfvec)).getquoted() + + +def cast_halfvec(value, cur): + return HalfVec.from_db(value) + + +def register_halfvec_info(oid): + halfvec = new_type((oid,), 'HALFVEC', cast_halfvec) + register_type(halfvec) + register_adapter(HalfVec, HalfvecAdapter) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 36b1e4c..627c1c6 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -27,12 +27,12 @@ def to_db_binary(value): return pack(f'>HH{len(value)}e', len(value), 0, *value) def from_db(value): - if value is None: + if value is None or isinstance(value, HalfVec): return value return HalfVec([float(v) for v in value[1:-1].split(',')]) def from_db_binary(value): - if value is None: + if value is None or isinstance(value, HalfVec): return value dim, unused = unpack_from('>HH', value) return HalfVec(unpack_from(f'>{dim}e', value, 4)) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 7e4cb6b..f7f6607 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -31,7 +31,7 @@ def 
to_db_binary(value): return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) def from_db(value): - if value is None: + if value is None or isinstance(value, SparseVec): return value elements, dim = value.split('/') indices = [] @@ -43,7 +43,7 @@ def from_db(value): return SparseVec(int(dim), indices, values) def from_db_binary(value): - if value is None: + if value is None or isinstance(value, SparseVec): return value dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 925d60f..1cc9669 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -7,22 +7,31 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') -cur.execute('DROP TABLE IF EXISTS items') -cur.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +cur.execute('DROP TABLE IF EXISTS psycopg2_items') +cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3))') register_vector(cur) class TestPsycopg2: def setup_method(self, test_method): - cur.execute('DELETE FROM items') + cur.execute('DELETE FROM psycopg2_items') - def test_works(self): + def test_vector(self): embedding = np.array([1.5, 2, 3]) - cur.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) + cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) - cur.execute('SELECT * FROM items ORDER BY id') + cur.execute('SELECT * FROM psycopg2_items ORDER BY id') res = cur.fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 assert res[1][1] is None + + def test_halfvec(self): + embedding = [1.5, 2, 3] + cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT id, half_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][1].to_list() 
== [1.5, 2, 3] + assert res[1][1] is None From 34c891e4131b13f434c3318d822288311a28c831 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:45:12 -0400 Subject: [PATCH 064/424] Added support for sparsevec type to Psycopg 2 [skip ci] --- CHANGELOG.md | 2 +- pgvector/psycopg2/__init__.py | 9 ++++++++- pgvector/psycopg2/sparsevec.py | 20 ++++++++++++++++++++ tests/test_psycopg2.py | 27 ++++++++++++++++++--------- 4 files changed, 47 insertions(+), 11 deletions(-) create mode 100644 pgvector/psycopg2/sparsevec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c93317..d4b1502 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Psycopg 3 -- Added support for `halfvec` type to Psycopg 2 +- Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg - Added support for `halfvec` and `sparsevec` types to Peewee - Added `L1Distance` for Django diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index bde04e5..16b7531 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,7 +1,8 @@ import psycopg2 from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import from_db, to_db +from ..utils import from_db, to_db, SparseVec __all__ = ['register_vector'] @@ -20,3 +21,9 @@ def register_vector(conn_or_curs=None): register_halfvec_info(cur.description[0][1]) except psycopg2.errors.UndefinedObject: pass + + try: + cur.execute('SELECT NULL::sparsevec') + register_sparsevec_info(cur.description[0][1]) + except psycopg2.errors.UndefinedObject: + pass diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py new file mode 100644 index 0000000..6603b2f --- /dev/null +++ b/pgvector/psycopg2/sparsevec.py @@ -0,0 +1,20 @@ +from psycopg2.extensions import adapt, 
new_type, register_adapter, register_type +from ..utils import SparseVec + + +class SparsevecAdapter(object): + def __init__(self, value): + self._value = value + + def getquoted(self): + return adapt(SparseVec.to_db(self._value)).getquoted() + + +def cast_sparsevec(value, cur): + return SparseVec.from_db(value) + + +def register_sparsevec_info(oid): + sparsevec = new_type((oid,), 'SPARSEVEC', cast_sparsevec) + register_type(sparsevec) + register_adapter(SparseVec, SparsevecAdapter) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 1cc9669..b5ac50a 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg2 import register_vector +from pgvector.psycopg2 import register_vector, SparseVec import psycopg2 conn = psycopg2.connect(dbname='pgvector_python_test') @@ -8,7 +8,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') cur.execute('DROP TABLE IF EXISTS psycopg2_items') -cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3))') +cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), sparse_embedding sparsevec(3))') register_vector(cur) @@ -21,17 +21,26 @@ def test_vector(self): embedding = np.array([1.5, 2, 3]) cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) - cur.execute('SELECT * FROM psycopg2_items ORDER BY id') + cur.execute('SELECT embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert np.array_equal(res[0][1], embedding) - assert res[0][1].dtype == np.float32 - assert res[1][1] is None + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None def test_halfvec(self): embedding = [1.5, 2, 3] cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) - cur.execute('SELECT id, half_embedding FROM 
psycopg2_items ORDER BY id') + cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][1].to_list() == [1.5, 2, 3] - assert res[1][1] is None + assert res[0][0].to_list() == [1.5, 2, 3] + assert res[1][0] is None + + def test_sparsevec(self): + embedding = SparseVec.from_dense([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (sparse_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0].to_dense() == [1.5, 2, 3] + assert res[1][0] is None From 57c1b3fca849c1c1ed781356e261dd248c024b06 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:46:05 -0400 Subject: [PATCH 065/424] Improved code [skip ci] --- pgvector/psycopg2/halfvec.py | 6 +++--- pgvector/psycopg2/vector.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index dc87ba1..8d974b3 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -3,11 +3,11 @@ class HalfvecAdapter(object): - def __init__(self, halfvec): - self._halfvec = halfvec + def __init__(self, value): + self._value = value def getquoted(self): - return adapt(HalfVec.to_db(self._halfvec)).getquoted() + return adapt(HalfVec.to_db(self._value)).getquoted() def cast_halfvec(value, cur): diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index f314948..8bcef06 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -4,11 +4,11 @@ class VectorAdapter(object): - def __init__(self, vector): - self._vector = vector + def __init__(self, value): + self._value = value def getquoted(self): - return adapt(to_db(self._vector)).getquoted() + return adapt(to_db(self._value)).getquoted() def cast_vector(value, cur): From 00805065e599dafb1ee0af11ec9dc0c9ee9e28a9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:48:16 
-0400 Subject: [PATCH 066/424] Improved code [skip ci] --- pgvector/asyncpg/__init__.py | 6 +++--- pgvector/utils/vector.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index c8fc6b2..9c6b660 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,4 +1,4 @@ -from ..utils import from_db, from_db_binary, to_db, to_db_binary, HalfVec, SparseVec +from ..utils import Vector, HalfVec, SparseVec __all__ = ['register_vector'] @@ -6,8 +6,8 @@ async def register_vector(conn): await conn.set_type_codec( 'vector', - encoder=to_db_binary, - decoder=from_db_binary, + encoder=Vector.to_db_binary, + decoder=Vector.from_db_binary, format='binary' ) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index d831dac..bf3e165 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -47,3 +47,10 @@ def to_db_binary(value): raise ValueError('expected ndim to be 1') return pack('>HH', value.shape[0], 0) + value.tobytes() + + +class Vector: + from_db = from_db + from_db_binary = from_db_binary + to_db = to_db + to_db_binary = to_db_binary From 36e20b300795ea1909994c561969026f77e94aaa Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:54:28 -0400 Subject: [PATCH 067/424] Improved code [skip ci] --- pgvector/django/__init__.py | 10 ++--- pgvector/peewee/__init__.py | 2 +- pgvector/peewee/vector.py | 6 +-- pgvector/psycopg/__init__.py | 3 -- pgvector/psycopg/vector.py | 10 ++--- pgvector/psycopg2/__init__.py | 2 +- pgvector/psycopg2/vector.py | 6 +-- pgvector/sqlalchemy/vector.py | 8 ++-- pgvector/utils/vector.py | 69 +++++++++++++++-------------------- 9 files changed, 52 insertions(+), 64 deletions(-) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index e4c8e21..abcf6dd 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -3,7 +3,7 @@ from django.db.models import Field, FloatField, 
Func, Value import numpy as np from .forms import VectorFormField -from ..utils import from_db, to_db +from ..utils import Vector __all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] @@ -34,15 +34,15 @@ def db_type(self, connection): return 'vector(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return from_db(value) + return Vector.from_db(value) def to_python(self, value): if isinstance(value, list): return np.array(value, dtype=np.float32) - return from_db(value) + return Vector.from_db(value) def get_prep_value(self, value): - return to_db(value) + return Vector.to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) @@ -111,7 +111,7 @@ class DistanceBase(Func): def __init__(self, expression, vector, **extra): if not hasattr(vector, 'resolve_expression'): - vector = Value(to_db(vector)) + vector = Value(Vector.to_db(vector)) super().__init__(expression, vector, **extra) diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 106170f..665c385 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,4 +1,4 @@ -from ..utils import from_db, to_db, SparseVec +from ..utils import SparseVec from .halfvec import HalfvecField from .sparsevec import SparsevecField from .vector import VectorField diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py index 756959d..3c11e74 100644 --- a/pgvector/peewee/vector.py +++ b/pgvector/peewee/vector.py @@ -1,5 +1,5 @@ from peewee import Expression, Field, Value -from ..utils import from_db, to_db +from ..utils import Vector class VectorField(Field): @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return to_db(value) + return Vector.to_db(value) def python_value(self, value): - return from_db(value) + return Vector.from_db(value) def 
_distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 3228afc..b5df2d1 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -5,9 +5,6 @@ from .vector import register_vector_info from ..utils import HalfVec, SparseVec -# TODO remove in 0.3.0 -from .vector import * -from ..utils import from_db, from_db_binary, to_db, to_db_binary __all__ = ['register_vector'] diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index aa15b92..fe606a6 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -1,6 +1,6 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import from_db, from_db_binary, to_db, to_db_binary +from ..utils import Vector class VectorDumper(Dumper): @@ -8,7 +8,7 @@ class VectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return to_db(obj).encode('utf8') + return Vector.to_db(obj).encode('utf8') class VectorBinaryDumper(VectorDumper): @@ -16,7 +16,7 @@ class VectorBinaryDumper(VectorDumper): format = Format.BINARY def dump(self, obj): - return to_db_binary(obj) + return Vector.to_db_binary(obj) class VectorLoader(Loader): @@ -26,7 +26,7 @@ class VectorLoader(Loader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return from_db(data.decode('utf8')) + return Vector.from_db(data.decode('utf8')) class VectorBinaryLoader(VectorLoader): @@ -36,7 +36,7 @@ class VectorBinaryLoader(VectorLoader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return from_db_binary(data) + return Vector.from_db_binary(data) def register_vector_info(context, info): diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 16b7531..764c0f7 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -2,7 +2,7 @@ from .halfvec import register_halfvec_info from 
.sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import from_db, to_db, SparseVec +from ..utils import SparseVec __all__ = ['register_vector'] diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 8bcef06..c0b066a 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -1,6 +1,6 @@ import numpy as np from psycopg2.extensions import adapt, new_type, register_adapter, register_type -from ..utils import from_db, to_db +from ..utils import Vector class VectorAdapter(object): @@ -8,11 +8,11 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(to_db(self._value)).getquoted() + return adapt(Vector.to_db(self._value)).getquoted() def cast_vector(value, cur): - return from_db(value) + return Vector.from_db(value) def register_vector_info(oid): diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py index 01f1f24..056dcef 100644 --- a/pgvector/sqlalchemy/vector.py +++ b/pgvector/sqlalchemy/vector.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import from_db, to_db +from ..utils import Vector as Vec class Vector(UserDefinedType): @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return to_db(value, self.dim) + return Vec.to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(to_db(value, self.dim)) + return string_literal_processor(Vec.to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return from_db(value) + return Vec.from_db(value) return process class comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py 
index bf3e165..c172b5e 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -2,55 +2,46 @@ from struct import pack, unpack_from -def from_db(value): - # could be ndarray if already cast by lower-level driver - if value is None or isinstance(value, np.ndarray): - return value - - return np.array(value[1:-1].split(','), dtype=np.float32) - - -def from_db_binary(value): - if value is None: - return value +class Vector: + def from_db(value): + # could be ndarray if already cast by lower-level driver + if value is None or isinstance(value, np.ndarray): + return value - dim, unused = unpack_from('>HH', value) - return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) + return np.array(value[1:-1].split(','), dtype=np.float32) + def from_db_binary(value): + if value is None or isinstance(value, np.ndarray): + return value -def to_db(value, dim=None): - if value is None: - return value + dim, unused = unpack_from('>HH', value) + return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) - if isinstance(value, np.ndarray): - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + def to_db(value, dim=None): + if value is None: + return value - if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): - raise ValueError('dtype must be numeric') + if isinstance(value, np.ndarray): + if value.ndim != 1: + raise ValueError('expected ndim to be 1') - value = value.tolist() + if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): + raise ValueError('dtype must be numeric') - if dim is not None and len(value) != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + value = value.tolist() - return '[' + ','.join([str(float(v)) for v in value]) + ']' + if dim is not None and len(value) != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + return '[' + 
','.join([str(float(v)) for v in value]) + ']' -def to_db_binary(value): - if value is None: - return value + def to_db_binary(value): + if value is None: + return value - value = np.asarray(value, dtype='>f') - - if value.ndim != 1: - raise ValueError('expected ndim to be 1') - - return pack('>HH', value.shape[0], 0) + value.tobytes() + value = np.asarray(value, dtype='>f') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') -class Vector: - from_db = from_db - from_db_binary = from_db_binary - to_db = to_db - to_db_binary = to_db_binary + return pack('>HH', value.shape[0], 0) + value.tobytes() From 5061cae02c461cf11980a51110884b74d99e49cc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 17:58:13 -0400 Subject: [PATCH 068/424] Moved indexes to separate file for Django [skip ci] --- pgvector/django/__init__.py | 47 +------------------------------------ pgvector/django/indexes.py | 46 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 46 deletions(-) create mode 100644 pgvector/django/indexes.py diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index abcf6dd..04d013d 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,8 +1,8 @@ from django.contrib.postgres.operations import CreateExtension -from django.contrib.postgres.indexes import PostgresIndex from django.db.models import Field, FloatField, Func, Value import numpy as np from .forms import VectorFormField +from .indexes import IvfflatIndex, HnswIndex from ..utils import Vector __all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] @@ -61,51 +61,6 @@ def formfield(self, **kwargs): return super().formfield(form_class=VectorFormField, **kwargs) -class IvfflatIndex(PostgresIndex): - suffix = 'ivfflat' - - def __init__(self, *expressions, lists=None, **kwargs): - self.lists = lists - super().__init__(*expressions, **kwargs) - - def 
deconstruct(self): - path, args, kwargs = super().deconstruct() - if self.lists is not None: - kwargs['lists'] = self.lists - return path, args, kwargs - - def get_with_params(self): - with_params = [] - if self.lists is not None: - with_params.append('lists = %d' % self.lists) - return with_params - - -class HnswIndex(PostgresIndex): - suffix = 'hnsw' - - def __init__(self, *expressions, m=None, ef_construction=None, **kwargs): - self.m = m - self.ef_construction = ef_construction - super().__init__(*expressions, **kwargs) - - def deconstruct(self): - path, args, kwargs = super().deconstruct() - if self.m is not None: - kwargs['m'] = self.m - if self.ef_construction is not None: - kwargs['ef_construction'] = self.ef_construction - return path, args, kwargs - - def get_with_params(self): - with_params = [] - if self.m is not None: - with_params.append('m = %d' % self.m) - if self.ef_construction is not None: - with_params.append('ef_construction = %d' % self.ef_construction) - return with_params - - class DistanceBase(Func): output_field = FloatField() diff --git a/pgvector/django/indexes.py b/pgvector/django/indexes.py new file mode 100644 index 0000000..5bec0eb --- /dev/null +++ b/pgvector/django/indexes.py @@ -0,0 +1,46 @@ +from django.contrib.postgres.indexes import PostgresIndex + + +class IvfflatIndex(PostgresIndex): + suffix = 'ivfflat' + + def __init__(self, *expressions, lists=None, **kwargs): + self.lists = lists + super().__init__(*expressions, **kwargs) + + def deconstruct(self): + path, args, kwargs = super().deconstruct() + if self.lists is not None: + kwargs['lists'] = self.lists + return path, args, kwargs + + def get_with_params(self): + with_params = [] + if self.lists is not None: + with_params.append('lists = %d' % self.lists) + return with_params + + +class HnswIndex(PostgresIndex): + suffix = 'hnsw' + + def __init__(self, *expressions, m=None, ef_construction=None, **kwargs): + self.m = m + self.ef_construction = ef_construction + 
super().__init__(*expressions, **kwargs) + + def deconstruct(self): + path, args, kwargs = super().deconstruct() + if self.m is not None: + kwargs['m'] = self.m + if self.ef_construction is not None: + kwargs['ef_construction'] = self.ef_construction + return path, args, kwargs + + def get_with_params(self): + with_params = [] + if self.m is not None: + with_params.append('m = %d' % self.m) + if self.ef_construction is not None: + with_params.append('ef_construction = %d' % self.ef_construction) + return with_params From 3415860599853caad00243060412776e247d3746 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:00:52 -0400 Subject: [PATCH 069/424] Moved VectorField to separate file for Django [skip ci] --- pgvector/django/__init__.py | 50 +---------------------------------- pgvector/django/vector.py | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 49 deletions(-) create mode 100644 pgvector/django/vector.py diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 04d013d..06adf01 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,8 +1,8 @@ from django.contrib.postgres.operations import CreateExtension from django.db.models import Field, FloatField, Func, Value import numpy as np -from .forms import VectorFormField from .indexes import IvfflatIndex, HnswIndex +from .vector import VectorField from ..utils import Vector __all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] @@ -13,54 +13,6 @@ def __init__(self): self.name = 'vector' -# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ -class VectorField(Field): - description = 'Vector' - empty_strings_allowed = False - - def __init__(self, *args, dimensions=None, **kwargs): - self.dimensions = dimensions - super().__init__(*args, **kwargs) - - def deconstruct(self): - name, path, args, kwargs = super().deconstruct() - if 
self.dimensions is not None: - kwargs['dimensions'] = self.dimensions - return name, path, args, kwargs - - def db_type(self, connection): - if self.dimensions is None: - return 'vector' - return 'vector(%d)' % self.dimensions - - def from_db_value(self, value, expression, connection): - return Vector.from_db(value) - - def to_python(self, value): - if isinstance(value, list): - return np.array(value, dtype=np.float32) - return Vector.from_db(value) - - def get_prep_value(self, value): - return Vector.to_db(value) - - def value_to_string(self, obj): - return self.get_prep_value(self.value_from_object(obj)) - - def validate(self, value, model_instance): - if isinstance(value, np.ndarray): - value = value.tolist() - super().validate(value, model_instance) - - def run_validators(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() - super().run_validators(value) - - def formfield(self, **kwargs): - return super().formfield(form_class=VectorFormField, **kwargs) - - class DistanceBase(Func): output_field = FloatField() diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py new file mode 100644 index 0000000..05789fa --- /dev/null +++ b/pgvector/django/vector.py @@ -0,0 +1,52 @@ +from django.db.models import Field +import numpy as np +from .forms import VectorFormField +from ..utils import Vector + + +# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +class VectorField(Field): + description = 'Vector' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'vector' + return 'vector(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + 
return Vector.from_db(value) + + def to_python(self, value): + if isinstance(value, list): + return np.array(value, dtype=np.float32) + return Vector.from_db(value) + + def get_prep_value(self, value): + return Vector.to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) + + def validate(self, value, model_instance): + if isinstance(value, np.ndarray): + value = value.tolist() + super().validate(value, model_instance) + + def run_validators(self, value): + if isinstance(value, np.ndarray): + value = value.tolist() + super().run_validators(value) + + def formfield(self, **kwargs): + return super().formfield(form_class=VectorFormField, **kwargs) From 18d1b65687e868eeb1959864c70b344d485798e2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:04:03 -0400 Subject: [PATCH 070/424] Improved code [skip ci] --- pgvector/django/__init__.py | 33 +-------------------------------- pgvector/django/forms.py | 12 ------------ pgvector/django/functions.py | 30 ++++++++++++++++++++++++++++++ pgvector/django/vector.py | 18 +++++++++++++++++- pgvector/django/widgets.py | 9 --------- 5 files changed, 48 insertions(+), 54 deletions(-) delete mode 100644 pgvector/django/forms.py create mode 100644 pgvector/django/functions.py delete mode 100644 pgvector/django/widgets.py diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 06adf01..a32d329 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,9 +1,7 @@ from django.contrib.postgres.operations import CreateExtension -from django.db.models import Field, FloatField, Func, Value -import numpy as np +from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance from .indexes import IvfflatIndex, HnswIndex from .vector import VectorField -from ..utils import Vector __all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] @@ 
-11,32 +9,3 @@ class VectorExtension(CreateExtension): def __init__(self): self.name = 'vector' - - -class DistanceBase(Func): - output_field = FloatField() - - def __init__(self, expression, vector, **extra): - if not hasattr(vector, 'resolve_expression'): - vector = Value(Vector.to_db(vector)) - super().__init__(expression, vector, **extra) - - -class L2Distance(DistanceBase): - function = '' - arg_joiner = ' <-> ' - - -class MaxInnerProduct(DistanceBase): - function = '' - arg_joiner = ' <#> ' - - -class CosineDistance(DistanceBase): - function = '' - arg_joiner = ' <=> ' - - -class L1Distance(DistanceBase): - function = '' - arg_joiner = ' <+> ' diff --git a/pgvector/django/forms.py b/pgvector/django/forms.py deleted file mode 100644 index 3748236..0000000 --- a/pgvector/django/forms.py +++ /dev/null @@ -1,12 +0,0 @@ -from django import forms -import numpy as np -from .widgets import VectorWidget - - -class VectorFormField(forms.CharField): - widget = VectorWidget - - def has_changed(self, initial, data): - if isinstance(initial, np.ndarray): - initial = initial.tolist() - return super().has_changed(initial, data) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py new file mode 100644 index 0000000..2638d67 --- /dev/null +++ b/pgvector/django/functions.py @@ -0,0 +1,30 @@ +from django.db.models import FloatField, Func, Value +from ..utils import Vector + +class DistanceBase(Func): + output_field = FloatField() + + def __init__(self, expression, vector, **extra): + if not hasattr(vector, 'resolve_expression'): + vector = Value(Vector.to_db(vector)) + super().__init__(expression, vector, **extra) + + +class L2Distance(DistanceBase): + function = '' + arg_joiner = ' <-> ' + + +class MaxInnerProduct(DistanceBase): + function = '' + arg_joiner = ' <#> ' + + +class CosineDistance(DistanceBase): + function = '' + arg_joiner = ' <=> ' + + +class L1Distance(DistanceBase): + function = '' + arg_joiner = ' <+> ' diff --git a/pgvector/django/vector.py 
b/pgvector/django/vector.py index 05789fa..063b5ff 100644 --- a/pgvector/django/vector.py +++ b/pgvector/django/vector.py @@ -1,6 +1,6 @@ +from django import forms from django.db.models import Field import numpy as np -from .forms import VectorFormField from ..utils import Vector @@ -50,3 +50,19 @@ def run_validators(self, value): def formfield(self, **kwargs): return super().formfield(form_class=VectorFormField, **kwargs) + + +class VectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, np.ndarray): + value = value.tolist() + return super().format_value(value) + + +class VectorFormField(forms.CharField): + widget = VectorWidget + + def has_changed(self, initial, data): + if isinstance(initial, np.ndarray): + initial = initial.tolist() + return super().has_changed(initial, data) diff --git a/pgvector/django/widgets.py b/pgvector/django/widgets.py deleted file mode 100644 index 731d632..0000000 --- a/pgvector/django/widgets.py +++ /dev/null @@ -1,9 +0,0 @@ -from django import forms -import numpy as np - - -class VectorWidget(forms.TextInput): - def format_value(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() - return super().format_value(value) From 7dbef6894d583901ab7a7b9135f7aea043b0792b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:06:06 -0400 Subject: [PATCH 071/424] Moved VectorExtension to separate file for Django [skip ci] --- pgvector/django/__init__.py | 7 +------ pgvector/django/extensions.py | 6 ++++++ pgvector/django/functions.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 pgvector/django/extensions.py diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index a32d329..051090a 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,11 +1,6 @@ -from django.contrib.postgres.operations import CreateExtension +from .extensions import VectorExtension from .functions import L2Distance, MaxInnerProduct, 
CosineDistance, L1Distance from .indexes import IvfflatIndex, HnswIndex from .vector import VectorField __all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] - - -class VectorExtension(CreateExtension): - def __init__(self): - self.name = 'vector' diff --git a/pgvector/django/extensions.py b/pgvector/django/extensions.py new file mode 100644 index 0000000..0573f72 --- /dev/null +++ b/pgvector/django/extensions.py @@ -0,0 +1,6 @@ +from django.contrib.postgres.operations import CreateExtension + + +class VectorExtension(CreateExtension): + def __init__(self): + self.name = 'vector' diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 2638d67..4f696ff 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -1,6 +1,7 @@ from django.db.models import FloatField, Func, Value from ..utils import Vector + class DistanceBase(Func): output_field = FloatField() From 2d49011020173ca4f5e41514c4b26a81672d00b2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:12:55 -0400 Subject: [PATCH 072/424] Added .pytest_cache to .gitignore [skip ci] --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1d3b727..f7ff659 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ venv/ .cache/ *.pyc __pycache__ +.pytest_cache/ From 828508ae6785400370263755f8ec01bc6b07e26a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:19:21 -0400 Subject: [PATCH 073/424] Added support for halfvec type to Django [skip ci] --- CHANGELOG.md | 1 + pgvector/django/__init__.py | 3 ++- pgvector/django/halfvec.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_django.py | 17 +++++++++++++---- 4 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 pgvector/django/halfvec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d4b1502..792dbab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 
+1,6 @@ ## 0.2.6 (unreleased) +- Added support for `halfvec` type to Django - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 051090a..5530c52 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,6 +1,7 @@ from .extensions import VectorExtension from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance +from .halfvec import HalfvecField from .indexes import IvfflatIndex, HnswIndex from .vector import VectorField -__all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] +__all__ = ['VectorExtension', 'VectorField', 'HalfvecField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py new file mode 100644 index 0000000..ec08a42 --- /dev/null +++ b/pgvector/django/halfvec.py @@ -0,0 +1,35 @@ +from django.db.models import Field +from ..utils import HalfVec + + +# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +class HalfvecField(Field): + description = 'Halfvec' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'halfvec' + return 'halfvec(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + return HalfVec.from_db(value) + + def to_python(self, value): + return HalfVec.from_db(value) + + def 
get_prep_value(self, value): + return HalfVec.to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) diff --git a/tests/test_django.py b/tests/test_django.py index b6274cd..b181d5c 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance +from pgvector.django import VectorExtension, VectorField, HalfvecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance from unittest import mock settings.configure( @@ -23,7 +23,8 @@ class Item(models.Model): - embedding = VectorField(dimensions=3) + embedding = VectorField(dimensions=3, null=True, blank=True) + half_embedding = HalfvecField(dimensions=3, null=True, blank=True) class Meta: app_label = 'myapp' @@ -56,7 +57,8 @@ class Migration(migrations.Migration): name='Item', fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('embedding', pgvector.django.VectorField(dimensions=3, null=True)), + ('embedding', pgvector.django.VectorField(dimensions=3, null=True, blank=True)), + ('half_embedding', pgvector.django.HalfvecField(dimensions=3, null=True, blank=True)), ], ), migrations.AddIndex( @@ -102,7 +104,7 @@ class TestDjango: def setup_method(self, test_method): Item.objects.all().delete() - def test_works(self): + def test_vector(self): item = Item(id=1, embedding=[1, 2, 3]) item.save() item = Item.objects.get(pk=1) @@ -110,6 +112,13 @@ def test_works(self): assert np.array_equal(item.embedding, np.array([1, 2, 3])) assert item.embedding.dtype == np.float32 + def test_halfvec(self): + item = Item(id=1, half_embedding=[1, 2, 3]) + item.save() + item = Item.objects.get(pk=1) + assert item.id == 1 + assert item.half_embedding.to_list() == [1, 2, 3] + def 
test_l2_distance(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) From 70c1f5813db4f0c312f39cac8a546c65e520c5ef Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:23:36 -0400 Subject: [PATCH 074/424] Added support for sparsevec type to Django [skip ci] --- CHANGELOG.md | 2 +- pgvector/django/__init__.py | 4 +++- pgvector/django/sparsevec.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_django.py | 11 ++++++++++- 4 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 pgvector/django/sparsevec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 792dbab..d5c9642 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## 0.2.6 (unreleased) -- Added support for `halfvec` type to Django +- Added support for `halfvec` and `sparsevec` types to Django - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 5530c52..2281ed9 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -2,6 +2,8 @@ from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance from .halfvec import HalfvecField from .indexes import IvfflatIndex, HnswIndex +from .sparsevec import SparsevecField from .vector import VectorField +from ..utils import SparseVec -__all__ = ['VectorExtension', 'VectorField', 'HalfvecField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] +__all__ = ['VectorExtension', 'VectorField', 'HalfvecField', 'SparsevecField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py new file mode 100644 index 0000000..91b9cf8 --- /dev/null +++ b/pgvector/django/sparsevec.py @@ -0,0 +1,35 @@ +from django.db.models import Field 
+from ..utils import SparseVec + + +# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +class SparsevecField(Field): + description = 'Sparsevec' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'sparsevec' + return 'sparsevec(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + return SparseVec.from_db(value) + + def to_python(self, value): + return SparseVec.from_db(value) + + def get_prep_value(self, value): + return SparseVec.to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) diff --git a/tests/test_django.py b/tests/test_django.py index b181d5c..271c2cc 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfvecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance +from pgvector.django import VectorExtension, VectorField, HalfvecField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, SparseVec from unittest import mock settings.configure( @@ -25,6 +25,7 @@ class Item(models.Model): embedding = VectorField(dimensions=3, null=True, blank=True) half_embedding = HalfvecField(dimensions=3, null=True, blank=True) + sparse_embedding = SparsevecField(dimensions=3, null=True, blank=True) class Meta: app_label = 'myapp' @@ -59,6 +60,7 @@ class Migration(migrations.Migration): ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 
('embedding', pgvector.django.VectorField(dimensions=3, null=True, blank=True)), ('half_embedding', pgvector.django.HalfvecField(dimensions=3, null=True, blank=True)), + ('sparse_embedding', pgvector.django.SparsevecField(dimensions=3, null=True, blank=True)), ], ), migrations.AddIndex( @@ -119,6 +121,13 @@ def test_halfvec(self): assert item.id == 1 assert item.half_embedding.to_list() == [1, 2, 3] + def test_sparsevec(self): + item = Item(id=1, sparse_embedding=SparseVec.from_dense([1, 2, 3])) + item.save() + item = Item.objects.get(pk=1) + assert item.id == 1 + assert item.sparse_embedding.to_dense() == [1, 2, 3] + def test_l2_distance(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) From b4b6c6b9f53a6574bb08773677e10472614f12b0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:28:59 -0400 Subject: [PATCH 075/424] Added support for halfvec type to SQLAlchemy and SQLModel [skip ci] --- CHANGELOG.md | 1 + pgvector/sqlalchemy/__init__.py | 3 +- pgvector/sqlalchemy/halfvec.py | 51 +++++++++++++++++++++++++++++++++ pgvector/utils/halfvec.py | 6 +++- tests/test_sqlalchemy.py | 5 ++-- 5 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 pgvector/sqlalchemy/halfvec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d5c9642..1331f28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Added support for `halfvec` and `sparsevec` types to Django +- Added support for `halfvec` type to SQLAlchemy and SQLModel - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 65193c6..d4c0179 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,3 +1,4 @@ +from .halfvec import Halfvec from .vector import Vector -__all__ = ['Vector'] +__all__ = 
['Vector', 'Halfvec'] diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py new file mode 100644 index 0000000..694d14f --- /dev/null +++ b/pgvector/sqlalchemy/halfvec.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from ..utils import HalfVec + + +class Halfvec(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'HALFVEC' + return 'HALFVEC(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return HalfVec.to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(HalfVec.to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return HalfVec.from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + + +# for reflection +ischema_names['halfvec'] = Halfvec diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 627c1c6..8b760cd 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -12,11 +12,15 @@ def __init__(self, value): def to_list(self): return list(self.value) - def to_db(value): + def to_db(value, dim=None): if value is None: return value if isinstance(value, HalfVec): value = value.value + + if dim is not None and len(value) != dim: + 
raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + return '[' + ','.join([str(float(v)) for v in value]) + ']' def to_db_binary(value): diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 5db608f..9d5d168 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import Vector, Halfvec import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -20,6 +20,7 @@ class Item(Base): id = mapped_column(Integer, primary_key=True) embedding = mapped_column(Vector(3)) + half_embedding = mapped_column(Halfvec(3)) Base.metadata.drop_all(engine) @@ -43,7 +44,7 @@ def create_items(): ] session = Session(engine) for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v)) + session.add(Item(id=i + 1, embedding=v, half_embedding=v)) session.commit() From f1cf31e0f7ae7b3106f85cd06ffdc8725fc89e74 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:32:32 -0400 Subject: [PATCH 076/424] Added support for sparsevec type to SQLAlchemy and SQLModel [skip ci] --- CHANGELOG.md | 4 +-- pgvector/sqlalchemy/__init__.py | 4 ++- pgvector/sqlalchemy/sparsevec.py | 51 ++++++++++++++++++++++++++++++++ pgvector/utils/sparsevec.py | 6 +++- tests/test_sqlalchemy.py | 5 ++-- 5 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 pgvector/sqlalchemy/sparsevec.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1331f28..eb6fab7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ -## 0.2.6 (unreleased) +## 0.3.0 (unreleased) - Added support for `halfvec` and `sparsevec` types to Django -- Added support for `halfvec` type to SQLAlchemy and SQLModel +- Added support for `halfvec` and `sparsevec` types to SQLAlchemy and SQLModel - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added 
support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index d4c0179..a7db132 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,4 +1,6 @@ from .halfvec import Halfvec +from .sparsevec import Sparsevec from .vector import Vector +from ..utils import SparseVec -__all__ = ['Vector', 'Halfvec'] +__all__ = ['Vector', 'Halfvec', 'Sparsevec'] diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py new file mode 100644 index 0000000..6e3c691 --- /dev/null +++ b/pgvector/sqlalchemy/sparsevec.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from ..utils import SparseVec + + +class Sparsevec(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'SPARSEVEC' + return 'SPARSEVEC(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return SparseVec.to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(SparseVec.to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return SparseVec.from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return 
self.op('<+>', return_type=Float)(other) + + +# for reflection +ischema_names['sparsevec'] = Sparsevec diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index f7f6607..9aae64e 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -19,9 +19,13 @@ def to_dense(self): vec[i] = v return vec - def to_db(value): + def to_db(value, dim=None): if value is None: return value + + if dim is not None and value.dim != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value.indices, value.values)]) + '}/' + str(value.dim) def to_db_binary(value): diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9d5d168..e6b869a 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec +from pgvector.sqlalchemy import Vector, Halfvec, Sparsevec, SparseVec import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -21,6 +21,7 @@ class Item(Base): id = mapped_column(Integer, primary_key=True) embedding = mapped_column(Vector(3)) half_embedding = mapped_column(Halfvec(3)) + sparse_embedding = mapped_column(Sparsevec(3)) Base.metadata.drop_all(engine) @@ -44,7 +45,7 @@ def create_items(): ] session = Session(engine) for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v, half_embedding=v)) + session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v))) session.commit() From 19054225385229ddf54f579010d509f608e10fbb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:33:52 -0400 Subject: [PATCH 077/424] Added new types to SQLModel tests [skip ci] --- tests/test_sqlmodel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlmodel.py 
b/tests/test_sqlmodel.py index 8d349ea..2fe2930 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import Vector, Halfvec, Sparsevec, SparseVec import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError @@ -17,6 +17,8 @@ class Item(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Vector(3))) + half_embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Halfvec(3))) + sparse_embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Sparsevec(3))) SQLModel.metadata.drop_all(engine) @@ -40,7 +42,7 @@ def create_items(): ] session = Session(engine) for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v)) + session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v))) session.commit() From c662eac0f5f0a112a6712293bd1b9d3676ea391c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:47:03 -0400 Subject: [PATCH 078/424] Added support for sparsevec to distance functions in Django [skip ci] --- pgvector/django/functions.py | 9 +++++++-- tests/test_django.py | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 4f696ff..fff2735 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -1,5 +1,5 @@ from django.db.models import FloatField, Func, Value -from ..utils import Vector +from ..utils import Vector, HalfVec, SparseVec class DistanceBase(Func): @@ -7,7 +7,12 @@ class DistanceBase(Func): def __init__(self, expression, vector, **extra): if not hasattr(vector, 'resolve_expression'): - vector = Value(Vector.to_db(vector)) + if isinstance(vector, HalfVec): + vector = Value(HalfVec.to_db(vector)) + elif 
isinstance(vector, SparseVec): + vector = Value(SparseVec.to_db(vector)) + else: + vector = Value(Vector.to_db(vector)) super().__init__(expression, vector, **extra) diff --git a/tests/test_django.py b/tests/test_django.py index 271c2cc..1c33b43 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -92,7 +92,7 @@ def create_items(): [1, 1, 2] ] for i, v in enumerate(vectors): - item = Item(id=i + 1, embedding=v) + item = Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v)) item.save() @@ -156,6 +156,20 @@ def test_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_halfvec_l2_distance(self): + create_items() + distance = L2Distance('half_embedding', [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + + def test_sparsevec_l2_distance(self): + create_items() + distance = L2Distance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_filter(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) From 82c976f96c1bbfdbbb78cdc2a014e229b9e51b2d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:48:45 -0400 Subject: [PATCH 079/424] Improved code and test [skip ci] --- pgvector/django/__init__.py | 17 +++++++++++++++-- tests/test_django.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 2281ed9..f03fad3 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -4,6 +4,19 @@ from .indexes import IvfflatIndex, HnswIndex from .sparsevec import SparsevecField from .vector import VectorField -from ..utils import SparseVec 
+from ..utils import HalfVec, SparseVec -__all__ = ['VectorExtension', 'VectorField', 'HalfvecField', 'SparsevecField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance', 'L1Distance'] +__all__ = [ + 'VectorExtension', + 'VectorField', + 'HalfvecField', + 'SparsevecField', + 'IvfflatIndex', + 'HnswIndex', + 'L2Distance', + 'MaxInnerProduct', + 'CosineDistance', + 'L1Distance', + 'HalfVec', + 'SparseVec' +] diff --git a/tests/test_django.py b/tests/test_django.py index 1c33b43..98e4adf 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfvecField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, SparseVec +from pgvector.django import VectorExtension, VectorField, HalfvecField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HalfVec, SparseVec from unittest import mock settings.configure( @@ -158,7 +158,7 @@ def test_l1_distance(self): def test_halfvec_l2_distance(self): create_items() - distance = L2Distance('half_embedding', [1, 1, 1]) + distance = L2Distance('half_embedding', HalfVec([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] From f399d3f99519acd489e466af76fd241698594b83 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:53:21 -0400 Subject: [PATCH 080/424] Improved typing in example and test [skip ci] --- README.md | 2 +- tests/test_sqlmodel.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19acac5..28703a6 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ from pgvector.sqlalchemy import Vector from sqlalchemy import Column class Item(SQLModel, table=True): - embedding: 
List[float] = Field(sa_column=Column(Vector(3))) + embedding: Any = Field(sa_column=Column(Vector(3))) ``` Insert a vector diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 2fe2930..777b6bc 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -5,7 +5,7 @@ from sqlalchemy.exc import StatementError from sqlalchemy.sql import func from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text -from typing import List, Optional +from typing import Any, List, Optional engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') with Session(engine) as session: @@ -16,9 +16,9 @@ class Item(SQLModel, table=True): __tablename__ = 'sqlmodel_item' id: Optional[int] = Field(default=None, primary_key=True) - embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Vector(3))) - half_embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Halfvec(3))) - sparse_embedding: Optional[List[float]] = Field(default=None, sa_column=Column(Sparsevec(3))) + embedding: Optional[Any] = Field(default=None, sa_column=Column(Vector(3))) + half_embedding: Optional[Any] = Field(default=None, sa_column=Column(Halfvec(3))) + sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(Sparsevec(3))) SQLModel.metadata.drop_all(engine) From 66af6db211f695271f95f5868b1ed86d15f0290c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 18:53:50 -0400 Subject: [PATCH 081/424] Removed unused import [skip ci] --- tests/test_sqlmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 777b6bc..2c81715 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -5,7 +5,7 @@ from sqlalchemy.exc import StatementError from sqlalchemy.sql import func from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text -from typing import Any, List, Optional +from typing import Any, Optional engine 
= create_engine('postgresql+psycopg2://localhost/pgvector_python_test') with Session(engine) as session: From 91900d6150480b36806ebc5a83391524a1d9d8df Mon Sep 17 00:00:00 2001 From: Cdingram Date: Thu, 16 May 2024 17:19:48 -0600 Subject: [PATCH 082/424] Fix for blank django form fields when VectorField is not required (#68) --- pgvector/django/vector.py | 5 +++++ tests/test_django.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py index 063b5ff..f6ef0f2 100644 --- a/pgvector/django/vector.py +++ b/pgvector/django/vector.py @@ -66,3 +66,8 @@ def has_changed(self, initial, data): if isinstance(initial, np.ndarray): initial = initial.tolist() return super().has_changed(initial, data) + + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/tests/test_django.py b/tests/test_django.py index 98e4adf..24be5f9 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -222,6 +222,14 @@ def test_form_save(self): assert form.save() assert [4, 5, 6] == Item.objects.get(pk=1).embedding.tolist() + def test_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = ItemForm(instance=item, data={'embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).embedding is None + def test_clean(self): item = Item(id=1, embedding=[1, 2, 3]) item.full_clean() From 9a74ba38eedf03c373474cc371df957724dcbc5f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 19:21:04 -0400 Subject: [PATCH 083/424] Improved field descriptions for Django [skip ci] --- pgvector/django/halfvec.py | 2 +- pgvector/django/sparsevec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index ec08a42..79103ad 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -4,7 +4,7 @@ # 
https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ class HalfvecField(Field): - description = 'Halfvec' + description = 'Half vector' empty_strings_allowed = False def __init__(self, *args, dimensions=None, **kwargs): diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 91b9cf8..1f39981 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -4,7 +4,7 @@ # https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ class SparsevecField(Field): - description = 'Sparsevec' + description = 'Sparse vector' empty_strings_allowed = False def __init__(self, *args, dimensions=None, **kwargs): From e583e442539b4185960d99231bdb14c127c988db Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 19:25:35 -0400 Subject: [PATCH 084/424] Updated URLs [skip ci] --- pgvector/django/halfvec.py | 2 +- pgvector/django/sparsevec.py | 2 +- pgvector/django/vector.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 79103ad..2fa9e52 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -2,7 +2,7 @@ from ..utils import HalfVec -# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ class HalfvecField(Field): description = 'Half vector' empty_strings_allowed = False diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 1f39981..b7f4c79 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -2,7 +2,7 @@ from ..utils import SparseVec -# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ class SparsevecField(Field): description = 'Sparse vector' empty_strings_allowed = False diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py index f6ef0f2..30fd99f 100644 --- a/pgvector/django/vector.py +++ 
b/pgvector/django/vector.py @@ -4,7 +4,7 @@ from ..utils import Vector -# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/ +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ class VectorField(Field): description = 'Vector' empty_strings_allowed = False From bf53f41e8caf077e2dab65eb3e20c186f9794994 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 19:38:05 -0400 Subject: [PATCH 085/424] Added support for bit type to Django [skip ci] --- CHANGELOG.md | 4 ++-- pgvector/django/__init__.py | 6 +++++- pgvector/django/bit.py | 21 +++++++++++++++++++++ pgvector/django/functions.py | 19 +++++++++++++++++++ tests/test_django.py | 22 +++++++++++++++++++++- 5 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 pgvector/django/bit.py diff --git a/CHANGELOG.md b/CHANGELOG.md index eb6fab7..69be6fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,12 @@ ## 0.3.0 (unreleased) -- Added support for `halfvec` and `sparsevec` types to Django +- Added support for `halfvec`, `bit`, and `sparsevec` types to Django - Added support for `halfvec` and `sparsevec` types to SQLAlchemy and SQLModel - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg - Added support for `halfvec` and `sparsevec` types to Peewee -- Added `L1Distance` for Django +- Added `L1Distance`, `HammingDistance`, and `JaccardDistance` for Django - Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee ## 0.2.5 (2024-02-07) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index f03fad3..677d77e 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -1,5 +1,6 @@ +from .bit import BitField from .extensions import VectorExtension -from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance +from .functions import L2Distance, MaxInnerProduct, 
CosineDistance, L1Distance, HammingDistance, JaccardDistance from .halfvec import HalfvecField from .indexes import IvfflatIndex, HnswIndex from .sparsevec import SparsevecField @@ -10,6 +11,7 @@ 'VectorExtension', 'VectorField', 'HalfvecField', + 'BitField', 'SparsevecField', 'IvfflatIndex', 'HnswIndex', @@ -17,6 +19,8 @@ 'MaxInnerProduct', 'CosineDistance', 'L1Distance', + 'HammingDistance', + 'JaccardDistance', 'HalfVec', 'SparseVec' ] diff --git a/pgvector/django/bit.py b/pgvector/django/bit.py new file mode 100644 index 0000000..941d694 --- /dev/null +++ b/pgvector/django/bit.py @@ -0,0 +1,21 @@ +from django.db.models import Field + + +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ +class BitField(Field): + description = 'Bit string' + + def __init__(self, *args, length=None, **kwargs): + self.length = length + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.length is not None: + kwargs['length'] = self.length + return name, path, args, kwargs + + def db_type(self, connection): + if self.length is None: + return 'bit' + return 'bit(%d)' % self.length diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index fff2735..fe6f2c5 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -16,6 +16,15 @@ def __init__(self, expression, vector, **extra): super().__init__(expression, vector, **extra) +class BitDistanceBase(Func): + output_field = FloatField() + + def __init__(self, expression, vector, **extra): + if not hasattr(vector, 'resolve_expression'): + vector = Value(vector) + super().__init__(expression, vector, **extra) + + class L2Distance(DistanceBase): function = '' arg_joiner = ' <-> ' @@ -34,3 +43,13 @@ class CosineDistance(DistanceBase): class L1Distance(DistanceBase): function = '' arg_joiner = ' <+> ' + + +class HammingDistance(BitDistanceBase): + function = '' + arg_joiner = ' <~> ' + + +class 
JaccardDistance(BitDistanceBase): + function = '' + arg_joiner = ' <%%> ' diff --git a/tests/test_django.py b/tests/test_django.py index 24be5f9..6dc91e0 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfvecField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HalfVec, SparseVec +from pgvector.django import VectorExtension, VectorField, HalfvecField, BitField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVec, SparseVec from unittest import mock settings.configure( @@ -25,6 +25,7 @@ class Item(models.Model): embedding = VectorField(dimensions=3, null=True, blank=True) half_embedding = HalfvecField(dimensions=3, null=True, blank=True) + binary_embedding = BitField(length=3, null=True, blank=True) sparse_embedding = SparsevecField(dimensions=3, null=True, blank=True) class Meta: @@ -60,6 +61,7 @@ class Migration(migrations.Migration): ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('embedding', pgvector.django.VectorField(dimensions=3, null=True, blank=True)), ('half_embedding', pgvector.django.HalfvecField(dimensions=3, null=True, blank=True)), + ('binary_embedding', pgvector.django.BitField(length=3, null=True, blank=True)), ('sparse_embedding', pgvector.django.SparsevecField(dimensions=3, null=True, blank=True)), ], ), @@ -170,6 +172,24 @@ def test_sparsevec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_bit_hamming_distance(self): + Item(id=1, binary_embedding='000').save() + Item(id=2, binary_embedding='101').save() + Item(id=3, binary_embedding='111').save() + distance = HammingDistance('binary_embedding', '101') + items = 
Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + Item(id=1, binary_embedding='000').save() + Item(id=2, binary_embedding='101').save() + Item(id=3, binary_embedding='111').save() + distance = JaccardDistance('binary_embedding', '101') + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + def test_filter(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) From 054eb4aed441d2752497531da08f668cf1c32325 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 19:49:48 -0400 Subject: [PATCH 086/424] Added support for bit type to SQLAlchemy and SQLModel [skip ci] --- CHANGELOG.md | 5 +++-- pgvector/sqlalchemy/__init__.py | 3 ++- pgvector/sqlalchemy/bit.py | 26 ++++++++++++++++++++++++++ tests/test_sqlalchemy.py | 28 ++++++++++++++++++++++++++-- 4 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 pgvector/sqlalchemy/bit.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 69be6fd..74a0f2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,14 @@ ## 0.3.0 (unreleased) - Added support for `halfvec`, `bit`, and `sparsevec` types to Django -- Added support for `halfvec` and `sparsevec` types to SQLAlchemy and SQLModel +- Added support for `halfvec`, `bit`, and `sparsevec` types to SQLAlchemy and SQLModel - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg - Added support for `halfvec` and `sparsevec` types to Peewee - Added `L1Distance`, `HammingDistance`, and `JaccardDistance` for Django -- Added `l1_distance` for SQLAlchemy, SQLModel, and Peewee +- Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for SQLAlchemy and 
SQLModel +- Added `l1_distance` for Peewee ## 0.2.5 (2024-02-07) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index a7db132..6e5a206 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,6 +1,7 @@ +from .bit import Bit from .halfvec import Halfvec from .sparsevec import Sparsevec from .vector import Vector from ..utils import SparseVec -__all__ = ['Vector', 'Halfvec', 'Sparsevec'] +__all__ = ['Vector', 'Halfvec', 'Bit', 'Sparsevec'] diff --git a/pgvector/sqlalchemy/bit.py b/pgvector/sqlalchemy/bit.py new file mode 100644 index 0000000..f71c1d0 --- /dev/null +++ b/pgvector/sqlalchemy/bit.py @@ -0,0 +1,26 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float + + +class Bit(UserDefinedType): + cache_ok = True + + def __init__(self, length=None): + super(UserDefinedType, self).__init__() + self.length = length + + def get_col_spec(self, **kw): + if self.length is None: + return 'BIT' + return 'BIT(%d)' % self.length + + class comparator_factory(UserDefinedType.Comparator): + def hamming_distance(self, other): + return self.op('<~>', return_type=Float)(other) + + def jaccard_distance(self, other): + return self.op('<%>', return_type=Float)(other) + + +# for reflection +ischema_names['bit'] = Bit diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index e6b869a..0003be4 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Sparsevec, SparseVec +from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVec import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -21,6 +21,7 @@ class Item(Base): id = mapped_column(Integer, primary_key=True) embedding = mapped_column(Vector(3)) half_embedding = 
mapped_column(Halfvec(3)) + binary_embedding = mapped_column(Bit(3)) sparse_embedding = mapped_column(Sparsevec(3)) @@ -62,7 +63,10 @@ def test_core(self): 'core_item', metadata, Column('id', Integer, primary_key=True), - Column('embedding', Vector(3)) + Column('embedding', Vector(3)), + Column('half_embedding', Halfvec(3)), + Column('binary_embedding', Bit(3)), + Column('sparse_embedding', Sparsevec(3)) ) metadata.drop_all(engine) @@ -157,6 +161,26 @@ def test_l1_distance_orm(self): items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_bit_hamming_distance(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='000')) + session.add(Item(id=2, binary_embedding='101')) + session.add(Item(id=3, binary_embedding='111')) + session.commit() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='000')) + session.add(Item(id=2, binary_embedding='101')) + session.add(Item(id=3, binary_embedding='111')) + session.commit() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + def test_filter(self): create_items() with Session(engine) as session: From 29781ed5801fbdda121f3ab8c51bea9614131886 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:25:30 -0400 Subject: [PATCH 087/424] Added support for bit type to Peewee [skip ci] --- CHANGELOG.md | 2 +- pgvector/peewee/__init__.py | 12 +++++++++++- pgvector/peewee/bit.py | 21 +++++++++++++++++++++ tests/test_peewee.py | 21 ++++++++++++++++++++- 4 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 pgvector/peewee/bit.py diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 74a0f2a..a37fb09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ - Added support for `halfvec` and `sparsevec` types to Psycopg 3 - Added support for `halfvec` and `sparsevec` types to Psycopg 2 - Added support for `halfvec` and `sparsevec` types to asyncpg -- Added support for `halfvec` and `sparsevec` types to Peewee +- Added support for `halfvec`, `bit`, and `sparsevec` types to Peewee - Added `L1Distance`, `HammingDistance`, and `JaccardDistance` for Django - Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for SQLAlchemy and SQLModel - Added `l1_distance` for Peewee diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 665c385..ef0df67 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,4 +1,14 @@ -from ..utils import SparseVec +from .bit import FixedBitField from .halfvec import HalfvecField from .sparsevec import SparsevecField from .vector import VectorField +from ..utils import HalfVec, SparseVec + +__all__ = [ + 'VectorField', + 'HalfvecField', + 'FixedBitField', + 'SparsevecField', + 'HalfVec', + 'SparseVec' +] diff --git a/pgvector/peewee/bit.py b/pgvector/peewee/bit.py new file mode 100644 index 0000000..8a3bfbc --- /dev/null +++ b/pgvector/peewee/bit.py @@ -0,0 +1,21 @@ +from peewee import Expression, Field, Value + + +class FixedBitField(Field): + field_type = 'bit' + + def __init__(self, max_length=None, *args, **kwargs): + self.max_length = max_length + super(FixedBitField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.max_length and [self.max_length] or None + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def hamming_distance(self, vector): + return self._distance('<~>', vector) + + def jaccard_distance(self, vector): + return self._distance('<%%>', vector) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index e429f8c..7b073e9 100644 --- 
a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfvecField, SparsevecField, SparseVec +from pgvector.peewee import VectorField, HalfvecField, FixedBitField, SparsevecField, SparseVec db = PostgresqlDatabase('pgvector_python_test') @@ -14,6 +14,7 @@ class Meta: class Item(BaseModel): embedding = VectorField(dimensions=3, null=True) half_embedding = HalfvecField(dimensions=3, null=True) + binary_embedding = FixedBitField(max_length=3, null=True) sparse_embedding = SparsevecField(dimensions=3, null=True) @@ -87,6 +88,24 @@ def test_sparsevec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_bit_hamming_distance(self): + Item.create(id=1, binary_embedding='000') + Item.create(id=2, binary_embedding='101') + Item.create(id=3, binary_embedding='111') + distance = Item.binary_embedding.hamming_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + Item.create(id=1, binary_embedding='000') + Item.create(id=2, binary_embedding='101') + Item.create(id=3, binary_embedding='111') + distance = Item.binary_embedding.jaccard_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) From 70a88bc4e59b90e94d1bc83b066be3554b57ae59 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:26:12 -0400 Subject: [PATCH 088/424] Updated changelog [skip ci] --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index a37fb09..14db629 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ - Added support for `halfvec`, `bit`, and `sparsevec` types to Peewee - Added `L1Distance`, `HammingDistance`, and `JaccardDistance` for Django - Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for SQLAlchemy and SQLModel -- Added `l1_distance` for Peewee +- Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for Peewee ## 0.2.5 (2024-02-07) From 66467240d5f47fbc2a6af755472453e6062e2286 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:32:42 -0400 Subject: [PATCH 089/424] Added test for bit with asyncpg [skip ci] --- tests/test_asyncpg.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 28cbacd..bc7a6e4 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -55,6 +55,30 @@ async def test_halfvec(self): await conn.close() + @pytest.mark.asyncio + async def test_bit(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS items') + await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding bit(3))') + + await register_vector(conn) + + embedding = asyncpg.BitString.from_int(5, 3) + await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + + res = await conn.fetch("SELECT * FROM items ORDER BY id") + assert res[0]['id'] == 1 + assert res[1]['id'] == 2 + assert res[0]['embedding'].to_int() == 5 + assert res[1]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '101' + + await conn.close() + @pytest.mark.asyncio async def test_sparsevec(self): conn = await asyncpg.connect(database='pgvector_python_test') From 
9f8cf3b7b17ec4b9646e4ff35dd871dffbcda43c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:34:17 -0400 Subject: [PATCH 090/424] Improved test [skip ci] --- tests/test_asyncpg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index bc7a6e4..06187ef 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -64,7 +64,7 @@ async def test_bit(self): await register_vector(conn) - embedding = asyncpg.BitString.from_int(5, 3) + embedding = asyncpg.BitString.from_int(5, length=3) await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM items ORDER BY id") From 0275ac502a21683b90dfe8bf4f6a62ce5701db90 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:38:03 -0400 Subject: [PATCH 091/424] Added test for bit with Psycopg 2 [skip ci] --- tests/test_psycopg2.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index b5ac50a..7302296 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -8,7 +8,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') cur.execute('DROP TABLE IF EXISTS psycopg2_items') -cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), sparse_embedding sparsevec(3))') +cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') register_vector(cur) @@ -36,6 +36,15 @@ def test_halfvec(self): assert res[0][0].to_list() == [1.5, 2, 3] assert res[1][0] is None + def test_bit(self): + embedding = '101' + cur.execute('INSERT INTO psycopg2_items (binary_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT binary_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] 
== '101' + assert res[1][0] is None + def test_sparsevec(self): embedding = SparseVec.from_dense([1.5, 2, 3]) cur.execute('INSERT INTO psycopg2_items (sparse_embedding) VALUES (%s), (NULL)', (embedding,)) From 8cd68b16f7cc0f29cde08ae4878ef49822180466 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:40:47 -0400 Subject: [PATCH 092/424] Improved tests for bit with Psycopg 3 [skip ci] --- tests/test_psycopg.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 9a7f611..0fa0789 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -114,6 +114,14 @@ def test_bit(self): res = conn.execute('SELECT %s::bit(3)', ('101',)).fetchone()[0] assert res == '101' + def test_bit_binary_format(self): + res = conn.execute('SELECT %b::bit(3)', ('101',), binary=True).fetchone()[0] + assert res == b'\x00\x00\x00\x03\xa0' + + def test_bit_text_format(self): + res = conn.execute('SELECT %t::bit(3)', ('101',)).fetchone()[0] + assert res == '101' + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From ca5036dbc73b509a4b5f9c43cbf2e78d1e24f21b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:48:10 -0400 Subject: [PATCH 093/424] Improved test names [skip ci] --- tests/test_django.py | 8 ++++---- tests/test_peewee.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 6dc91e0..a32a774 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -130,28 +130,28 @@ def test_sparsevec(self): assert item.id == 1 assert item.sparse_embedding.to_dense() == [1, 2, 3] - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v 
in items] == [0, 1, sqrt(3)] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() distance = MaxInnerProduct('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] - def test_cosine_distance(self): + def test_vector_cosine_distance(self): create_items() distance = CosineDistance('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] - def test_l1_distance(self): + def test_vector_l1_distance(self): create_items() distance = L1Distance('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 7b073e9..9b21c0d 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -40,34 +40,34 @@ class TestPeewee: def setup_method(self, test_method): Item.truncate_table() - def test_works(self): + def test_vector(self): Item.create(id=1, embedding=[1, 2, 3]) item = Item.get_by_id(1) assert np.array_equal(item.embedding, np.array([1, 2, 3])) assert item.embedding.dtype == np.float32 - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() distance = Item.embedding.l2_distance([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() distance = Item.embedding.max_inner_product([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] - def test_cosine_distance(self): + def 
test_vector_cosine_distance(self): create_items() distance = Item.embedding.cosine_distance([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] - def test_l1_distance(self): + def test_vector_l1_distance(self): create_items() distance = Item.embedding.l1_distance([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) From ebe99b511d9eccf3fae5dd6195b95484818e4fb9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:51:29 -0400 Subject: [PATCH 094/424] Added support for NumPy arrays to HalfVec [skip ci] --- pgvector/utils/halfvec.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 8b760cd..75fca83 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -1,9 +1,12 @@ +import numpy as np from struct import pack, unpack_from class HalfVec: def __init__(self, value): - # TODO support np.array + if isinstance(value, np.ndarray): + value = value.tolist() + if not isinstance(value, (list, tuple)): raise ValueError('expected list or tuple') From 89b9b85a1c912d9f0562cf83153452231bc71a45 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:54:06 -0400 Subject: [PATCH 095/424] Improved Psycopg 3 tests [skip ci] --- tests/test_psycopg.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 0fa0789..306b222 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -6,47 +6,47 @@ conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') -conn.execute('DROP TABLE IF EXISTS items') -conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +conn.execute('DROP 
TABLE IF EXISTS psycopg_items') +conn.execute('CREATE TABLE psycopg_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') register_vector(conn) class TestPsycopg: def setup_method(self, test_method): - conn.execute('DELETE FROM items') + conn.execute('DELETE FROM psycopg_items') - def test_works(self): + def test_vector(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) + conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s), (NULL)', (embedding,)) - res = conn.execute('SELECT * FROM items ORDER BY id').fetchall() + res = conn.execute('SELECT * FROM psycopg_items ORDER BY id').fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 assert res[1][1] is None - def test_binary_format(self): + def test_vector_binary_format(self): embedding = np.array([1.5, 2, 3]) res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] assert np.array_equal(res, embedding) - def test_text_format(self): + def test_vector_text_format(self): embedding = np.array([1.5, 2, 3]) res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] assert np.array_equal(res, embedding) - def test_binary_format_correct(self): + def test_vector_binary_format_correct(self): embedding = np.array([1.5, 2, 3]) res = conn.execute('SELECT %b::vector::text', (embedding,)).fetchone()[0] assert res == '[1.5,2,3]' - def test_text_format_non_contiguous(self): + def test_vector_text_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] assert np.array_equal(res, np.array([3, 2, 1.5])) - def test_binary_format_non_contiguous(self): + def test_vector_binary_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous 
res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] @@ -55,19 +55,19 @@ def test_binary_format_non_contiguous(self): def test_text_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY items (embedding) FROM STDIN") as copy: - copy.write_row([embedding]) + with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: + copy.write_row([embedding, HalfVec(embedding), '101', SparseVec.from_dense(embedding)]) def test_binary_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + with cur.copy("COPY psycopg_items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.write_row([embedding]) def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + with cur.copy("COPY psycopg_items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(['int8', 'vector']) copy.write_row([1, embedding]) @@ -127,16 +127,16 @@ async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS psycopg_items') + await conn.execute('CREATE TABLE psycopg_items (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector_async(conn) embedding = np.array([1.5, 2, 3]) - await conn.execute('INSERT INTO items (embedding) VALUES (%s), (NULL)', (embedding,)) + await conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s), (NULL)', (embedding,)) async with conn.cursor() as cur: - await cur.execute('SELECT * FROM items ORDER BY id') + await 
cur.execute('SELECT * FROM psycopg_items ORDER BY id') res = await cur.fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 From cb755fa5c6728e293512f8a6787ee4e325968aa3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 20:55:10 -0400 Subject: [PATCH 096/424] Improved test [skip ci] --- tests/test_psycopg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 306b222..f674a66 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -67,9 +67,9 @@ def test_binary_copy(self): def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY psycopg_items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.set_types(['int8', 'vector']) - copy.write_row([1, embedding]) + with cur.copy("COPY psycopg_items (id, embedding, half_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.set_types(['int8', 'vector', 'halfvec', 'sparsevec']) + copy.write_row([1, embedding, HalfVec(embedding), SparseVec.from_dense(embedding)]) def test_halfvec(self): conn.execute('DROP TABLE IF EXISTS half_items') From e1843c20722b89547b8ac44857271439796ead95 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:05:49 -0400 Subject: [PATCH 097/424] Added tests for bit for SQLModel --- tests/test_sqlmodel.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 2c81715..d0a6ccd 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Sparsevec, SparseVec +from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVec import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError @@ -18,6 +18,7 @@ class Item(SQLModel, table=True): id: 
Optional[int] = Field(default=None, primary_key=True) embedding: Optional[Any] = Field(default=None, sa_column=Column(Vector(3))) half_embedding: Optional[Any] = Field(default=None, sa_column=Column(Halfvec(3))) + binary_embedding: Optional[Any] = Field(default=None, sa_column=Column(Bit(3))) sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(Sparsevec(3))) @@ -99,6 +100,26 @@ def test_l1_distance(self): items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_bit_hamming_distance(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='000')) + session.add(Item(id=2, binary_embedding='101')) + session.add(Item(id=3, binary_embedding='111')) + session.commit() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='000')) + session.add(Item(id=2, binary_embedding='101')) + session.add(Item(id=3, binary_embedding='111')) + session.commit() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + def test_filter(self): create_items() with Session(engine) as session: From ca571f0607d80ead8306102b2b824f03b3225d17 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:15:22 -0400 Subject: [PATCH 098/424] Added support for passing dense vectors to sparsevec columns and functions [skip ci] --- pgvector/utils/sparsevec.py | 10 ++++++++++ tests/test_sqlmodel.py | 20 ++++++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 9aae64e..afcef65 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ 
-1,3 +1,4 @@ +import numpy as np from struct import pack, unpack_from @@ -8,6 +9,8 @@ def __init__(self, dim, indices, values): self.values = values def from_dense(value): + if isinstance(value, np.ndarray): + value = value.tolist() dim = len(value) indices = [i for i, v in enumerate(value) if v != 0] values = [value[i] for i in indices] @@ -23,6 +26,9 @@ def to_db(value, dim=None): if value is None: return value + if isinstance(value, (list, np.ndarray)): + value = SparseVec.from_dense(value) + if dim is not None and value.dim != dim: raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) @@ -31,6 +37,10 @@ def to_db(value, dim=None): def to_db_binary(value): if value is None: return value + + if isinstance(value, (list, np.ndarray)): + value = SparseVec.from_dense(value) + nnz = len(value.indices) return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index d0a6ccd..068d801 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -76,30 +76,42 @@ def test_orm(self): assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_cosine_distance(self): + def test_vector_cosine_distance(self): create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_l1_distance(self): + def test_vector_l1_distance(self): 
create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_halfvec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_bit_hamming_distance(self): session = Session(engine) session.add(Item(id=1, binary_embedding='000')) From 92cc17fb4c0f09614aed6198bd309f57ca6bfa1a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:22:13 -0400 Subject: [PATCH 099/424] Improved SparseVec code [skip ci] --- pgvector/utils/sparsevec.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index afcef65..040ef2c 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -2,6 +2,15 @@ from struct import pack, unpack_from +def to_db_value(value): + if isinstance(value, SparseVec): + return value + elif isinstance(value, (list, np.ndarray)): + return SparseVec.from_dense(value) + else: + raise ValueError('expected sparsevec') + + class SparseVec: def __init__(self, dim, indices, values): self.dim = dim @@ -26,11 +35,10 @@ def to_db(value, dim=None): if value is None: return value - if isinstance(value, (list, np.ndarray)): - value = SparseVec.from_dense(value) + value = to_db_value(value) if dim is not None and value.dim != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + raise ValueError('expected %d dimensions, not %d' % (dim, value.dim)) return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value.indices, value.values)]) 
+ '}/' + str(value.dim) @@ -38,9 +46,7 @@ def to_db_binary(value): if value is None: return value - if isinstance(value, (list, np.ndarray)): - value = SparseVec.from_dense(value) - + value = to_db_value(value) nnz = len(value.indices) return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) From 62e5befc993000b6e632c2f73912f051b236a7cd Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:28:07 -0400 Subject: [PATCH 100/424] Added more tests for SQLModel [skip ci] --- tests/test_sqlmodel.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 068d801..1a5b7de 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -106,12 +106,48 @@ def test_halfvec_l2_distance(self): items = session.exec(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_halfvec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec_l2_distance(self): create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = 
session.exec(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_bit_hamming_distance(self): session = Session(engine) session.add(Item(id=1, binary_embedding='000')) From 705f27051eda69077ee54fa300e4b367ab73e406 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:30:31 -0400 Subject: [PATCH 101/424] Added more tests for SQLModel [skip ci] --- tests/test_sqlmodel.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 1a5b7de..58ac2c9 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -180,7 +180,7 @@ def test_select(self): items = session.exec(select(Item.embedding.l2_distance([1, 1, 1]))).all() assert items[0] == 3 - def test_avg(self): + def test_vector_avg(self): with Session(engine) as session: avg = session.exec(select(func.avg(Item.embedding))).first() assert avg is None @@ -189,7 +189,7 @@ def test_avg(self): avg = session.exec(select(func.avg(Item.embedding))).first() assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) - def test_sum(self): + def test_vector_sum(self): with Session(engine) as session: sum = session.exec(select(func.sum(Item.embedding))).first() assert sum is None @@ -198,6 +198,24 @@ def test_sum(self): sum = session.exec(select(func.sum(Item.embedding))).first() assert np.array_equal(sum, np.array([5, 7, 9])) + def test_halfvec_avg(self): + with Session(engine) as session: + avg 
= session.exec(select(func.avg(Item.half_embedding))).first() + assert avg is None + session.add(Item(half_embedding=[1, 2, 3])) + session.add(Item(half_embedding=[4, 5, 6])) + avg = session.exec(select(func.avg(Item.half_embedding))).first() + assert avg.to_list() == [2.5, 3.5, 4.5] + + def test_halfvec_sum(self): + with Session(engine) as session: + sum = session.exec(select(func.sum(Item.half_embedding))).first() + assert sum is None + session.add(Item(half_embedding=[1, 2, 3])) + session.add(Item(half_embedding=[4, 5, 6])) + sum = session.exec(select(func.sum(Item.half_embedding))).first() + assert sum.to_list() == [5, 7, 9] + def test_bad_dimensions(self): item = Item(embedding=[1, 2]) session = Session(engine) From 34c2f92985e37b142232a49e73808dd76eb69d10 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:32:21 -0400 Subject: [PATCH 102/424] Added more tests for SQLAlchemy [skip ci] --- tests/test_sqlalchemy.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0003be4..887859c 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -113,54 +113,66 @@ def test_orm(self): assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_l2_distance_orm(self): + def test_vector_l2_distance_orm(self): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() with Session(engine) as session: items = 
session.query(Item).order_by(Item.embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_max_inner_product_orm(self): + def test_vector_max_inner_product_orm(self): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_cosine_distance(self): + def test_vector_cosine_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_cosine_distance_orm(self): + def test_vector_cosine_distance_orm(self): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_l1_distance(self): + def test_vector_l1_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_l1_distance_orm(self): + def test_vector_l1_distance_orm(self): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_halfvec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + def test_bit_hamming_distance(self): session = Session(engine) session.add(Item(id=1, binary_embedding='000')) From 
637f318b4a58f1182e488217957cd817bab6c9b5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:36:18 -0400 Subject: [PATCH 103/424] Added more tests for Peewee [skip ci] --- tests/test_peewee.py | 62 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 9b21c0d..c9bd085 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -81,6 +81,27 @@ def test_halfvec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_halfvec_max_inner_product(self): + create_items() + distance = Item.half_embedding.max_inner_product([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_halfvec_cosine_distance(self): + create_items() + distance = Item.half_embedding.cosine_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_halfvec_l1_distance(self): + create_items() + distance = Item.half_embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_sparsevec_l2_distance(self): create_items() distance = Item.sparse_embedding.l2_distance(SparseVec.from_dense([1, 1, 1])) @@ -88,6 +109,27 @@ def test_sparsevec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_sparsevec_max_inner_product(self): + create_items() + distance = Item.sparse_embedding.max_inner_product([1, 1, 1]) + items = Item.select(Item.id, 
distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_sparsevec_cosine_distance(self): + create_items() + distance = Item.sparse_embedding.cosine_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_sparsevec_l1_distance(self): + create_items() + distance = Item.sparse_embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_bit_hamming_distance(self): Item.create(id=1, binary_embedding='000') Item.create(id=2, binary_embedding='101') @@ -111,7 +153,7 @@ def test_where(self): items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) assert [v.id for v in items] == [1] - def test_avg(self): + def test_vector_avg(self): avg = Item.select(fn.avg(Item.embedding)).scalar() assert avg is None Item.create(embedding=[1, 2, 3]) @@ -119,7 +161,7 @@ def test_avg(self): avg = Item.select(fn.avg(Item.embedding)).scalar() assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) - def test_sum(self): + def test_vector_sum(self): sum = Item.select(fn.sum(Item.embedding)).scalar() assert sum is None Item.create(embedding=[1, 2, 3]) @@ -127,6 +169,22 @@ def test_sum(self): sum = Item.select(fn.sum(Item.embedding)).scalar() assert np.array_equal(sum, np.array([5, 7, 9])) + def test_halfvec_avg(self): + avg = Item.select(fn.avg(Item.half_embedding)).scalar() + assert avg is None + Item.create(half_embedding=[1, 2, 3]) + Item.create(half_embedding=[4, 5, 6]) + avg = Item.select(fn.avg(Item.half_embedding)).scalar() + assert avg.to_list() == [2.5, 3.5, 4.5] + + def test_halfvec_sum(self): + sum = 
Item.select(fn.sum(Item.half_embedding)).scalar() + assert sum is None + Item.create(half_embedding=[1, 2, 3]) + Item.create(half_embedding=[4, 5, 6]) + sum = Item.select(fn.sum(Item.half_embedding)).scalar() + assert sum.to_list() == [5, 7, 9] + def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) Item.get_or_create(embedding=np.array([4, 5, 6])) From 84c25d843d49d52024df490fbb31e470995583a3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:40:16 -0400 Subject: [PATCH 104/424] Added more tests for Django [skip ci] --- tests/test_django.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index a32a774..2281d8a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -165,6 +165,27 @@ def test_halfvec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_halfvec_max_inner_product(self): + create_items() + distance = MaxInnerProduct('half_embedding', HalfVec([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_halfvec_cosine_distance(self): + create_items() + distance = CosineDistance('half_embedding', HalfVec([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_halfvec_l1_distance(self): + create_items() + distance = L1Distance('half_embedding', HalfVec([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_sparsevec_l2_distance(self): create_items() distance = L2Distance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) @@ 
-172,6 +193,27 @@ def test_sparsevec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] + def test_sparsevec_max_inner_product(self): + create_items() + distance = MaxInnerProduct('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_sparsevec_cosine_distance(self): + create_items() + distance = CosineDistance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_sparsevec_l1_distance(self): + create_items() + distance = L1Distance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_bit_hamming_distance(self): Item(id=1, binary_embedding='000').save() Item(id=2, binary_embedding='101').save() From ae6384b2b395721b2c9009bb20bc463e522ae70d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:42:01 -0400 Subject: [PATCH 105/424] Added more tests for Django [skip ci] --- tests/test_django.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 2281d8a..12d9c37 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -238,7 +238,7 @@ def test_filter(self): items = Item.objects.alias(distance=distance).filter(distance__lt=1) assert [v.id for v in items] == [1] - def test_avg(self): + def test_vector_avg(self): avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] assert avg is None Item(embedding=[1, 2, 3]).save() @@ -246,7 +246,7 @@ def test_avg(self): 
avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) - def test_sum(self): + def test_vector_sum(self): sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] assert sum is None Item(embedding=[1, 2, 3]).save() @@ -254,6 +254,22 @@ def test_sum(self): sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] assert np.array_equal(sum, np.array([5, 7, 9])) + def test_halfvec_avg(self): + avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] + assert avg is None + Item(half_embedding=[1, 2, 3]).save() + Item(half_embedding=[4, 5, 6]).save() + avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] + assert avg.to_list() == [2.5, 3.5, 4.5] + + def test_halfvec_sum(self): + sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] + assert sum is None + Item(half_embedding=[1, 2, 3]).save() + Item(half_embedding=[4, 5, 6]).save() + sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] + assert sum.to_list() == [5, 7, 9] + def test_serialization(self): create_items() items = Item.objects.all() From 16508ecc16977a14953d80e3f69348cf55af1a5e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:44:27 -0400 Subject: [PATCH 106/424] Added bit test for Django [skip ci] --- tests/test_django.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index 12d9c37..a8b1adf 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -123,6 +123,13 @@ def test_halfvec(self): assert item.id == 1 assert item.half_embedding.to_list() == [1, 2, 3] + def test_bit(self): + item = Item(id=1, binary_embedding='101') + item.save() + item = Item.objects.get(pk=1) + assert item.id == 1 + assert item.binary_embedding == '101' + def test_sparsevec(self): item = Item(id=1, sparse_embedding=SparseVec.from_dense([1, 2, 3])) item.save() From 
5c8711537e8202bc438dffde15e7d0560367c96f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:48:08 -0400 Subject: [PATCH 107/424] Updated table name for asyncpg tests [skip ci] --- tests/test_asyncpg.py | 48 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 06187ef..7405631 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -10,15 +10,15 @@ class TestAsyncpg: async def test_vector(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector(conn) embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM items ORDER BY id") + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) @@ -26,7 +26,7 @@ async def test_vector(self): assert res[1]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '[1.5,2,3]' await conn.close() @@ -35,22 +35,22 @@ async def test_vector(self): async def test_halfvec(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await 
conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding halfvec(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding halfvec(3))') await register_vector(conn) embedding = [1.5, 2, 3] - await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM items ORDER BY id") + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert res[0]['embedding'].to_list() == [1.5, 2, 3] assert res[1]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '[1.5,2,3]' await conn.close() @@ -59,22 +59,22 @@ async def test_halfvec(self): async def test_bit(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding bit(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding bit(3))') await register_vector(conn) embedding = asyncpg.BitString.from_int(5, length=3) - await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM items ORDER BY id") + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 
assert res[0]['embedding'].to_int() == 5 assert res[1]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '101' await conn.close() @@ -83,22 +83,22 @@ async def test_bit(self): async def test_sparsevec(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding sparsevec(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding sparsevec(3))') await register_vector(conn) embedding = SparseVec.from_dense([1.5, 2, 3]) - await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM items ORDER BY id") + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert res[0]['embedding'].to_dense() == [1.5, 2, 3] assert res[1]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM items ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '{1:1.5,2:2,3:3}/3' await conn.close() @@ -112,13 +112,13 @@ async def init(conn): async with pool.acquire() as conn: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS items') - await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS 
asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO items (embedding) VALUES ($1), (NULL)", embedding) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) - res = await conn.fetch("SELECT * FROM items ORDER BY id") + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert res[0]['id'] == 1 assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) From a761245b0a967cd6f2d230c7fb4f3c514a822060 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 16 May 2024 21:52:30 -0400 Subject: [PATCH 108/424] Updated table name for Peewee tests [skip ci] --- tests/test_peewee.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index c9bd085..4e7eb44 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -17,6 +17,9 @@ class Item(BaseModel): binary_embedding = FixedBitField(max_length=3, null=True) sparse_embedding = SparsevecField(dimensions=3, null=True) + class Meta: + table_name = 'peewee_item' + Item.add_index('embedding vector_l2_ops', using='hnsw') From 92fe82326e7f6262471a63fa42ef8c4d059b44a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 12:58:21 -0400 Subject: [PATCH 109/424] Renamed HalfVec to HalfVector and SparseVec to SparseVector --- pgvector/asyncpg/__init__.py | 10 +++++----- pgvector/django/__init__.py | 6 +++--- pgvector/django/functions.py | 10 +++++----- pgvector/django/halfvec.py | 8 ++++---- pgvector/django/sparsevec.py | 8 ++++---- pgvector/peewee/__init__.py | 6 +++--- pgvector/peewee/halfvec.py | 6 +++--- pgvector/peewee/sparsevec.py | 6 +++--- pgvector/psycopg/__init__.py | 2 +- pgvector/psycopg/halfvec.py | 30 +++++++++++++++--------------- pgvector/psycopg/sparsevec.py | 30 +++++++++++++++--------------- pgvector/psycopg2/__init__.py | 2 +- pgvector/psycopg2/halfvec.py | 
8 ++++---- pgvector/psycopg2/sparsevec.py | 8 ++++---- pgvector/sqlalchemy/__init__.py | 2 +- pgvector/sqlalchemy/halfvec.py | 8 ++++---- pgvector/sqlalchemy/sparsevec.py | 8 ++++---- pgvector/utils/halfvec.py | 16 ++++++++-------- pgvector/utils/sparsevec.py | 18 +++++++++--------- tests/test_asyncpg.py | 4 ++-- tests/test_django.py | 22 +++++++++++----------- tests/test_peewee.py | 6 +++--- tests/test_psycopg.py | 18 +++++++++--------- tests/test_psycopg2.py | 4 ++-- tests/test_sqlalchemy.py | 4 ++-- tests/test_sqlmodel.py | 4 ++-- 26 files changed, 127 insertions(+), 127 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 9c6b660..2252c7a 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,4 +1,4 @@ -from ..utils import Vector, HalfVec, SparseVec +from ..utils import Vector, HalfVector, SparseVector __all__ = ['register_vector'] @@ -13,14 +13,14 @@ async def register_vector(conn): await conn.set_type_codec( 'halfvec', - encoder=HalfVec.to_db_binary, - decoder=HalfVec.from_db_binary, + encoder=HalfVector.to_db_binary, + decoder=HalfVector.from_db_binary, format='binary' ) await conn.set_type_codec( 'sparsevec', - encoder=SparseVec.to_db_binary, - decoder=SparseVec.from_db_binary, + encoder=SparseVector.to_db_binary, + decoder=SparseVector.from_db_binary, format='binary' ) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 677d77e..08407c7 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -5,7 +5,7 @@ from .indexes import IvfflatIndex, HnswIndex from .sparsevec import SparsevecField from .vector import VectorField -from ..utils import HalfVec, SparseVec +from ..utils import HalfVector, SparseVector __all__ = [ 'VectorExtension', @@ -21,6 +21,6 @@ 'L1Distance', 'HammingDistance', 'JaccardDistance', - 'HalfVec', - 'SparseVec' + 'HalfVector', + 'SparseVector' ] diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 
fe6f2c5..9b11869 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -1,5 +1,5 @@ from django.db.models import FloatField, Func, Value -from ..utils import Vector, HalfVec, SparseVec +from ..utils import Vector, HalfVector, SparseVector class DistanceBase(Func): @@ -7,10 +7,10 @@ class DistanceBase(Func): def __init__(self, expression, vector, **extra): if not hasattr(vector, 'resolve_expression'): - if isinstance(vector, HalfVec): - vector = Value(HalfVec.to_db(vector)) - elif isinstance(vector, SparseVec): - vector = Value(SparseVec.to_db(vector)) + if isinstance(vector, HalfVector): + vector = Value(HalfVector.to_db(vector)) + elif isinstance(vector, SparseVector): + vector = Value(SparseVector.to_db(vector)) else: vector = Value(Vector.to_db(vector)) super().__init__(expression, vector, **extra) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 2fa9e52..884f706 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -1,5 +1,5 @@ from django.db.models import Field -from ..utils import HalfVec +from ..utils import HalfVector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ @@ -23,13 +23,13 @@ def db_type(self, connection): return 'halfvec(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return HalfVec.from_db(value) + return HalfVector.from_db(value) def to_python(self, value): - return HalfVec.from_db(value) + return HalfVector.from_db(value) def get_prep_value(self, value): - return HalfVec.to_db(value) + return HalfVector.to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index b7f4c79..1f4c21f 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -1,5 +1,5 @@ from django.db.models import Field -from ..utils import SparseVec +from ..utils import SparseVector # 
https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ @@ -23,13 +23,13 @@ def db_type(self, connection): return 'sparsevec(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return SparseVec.from_db(value) + return SparseVector.from_db(value) def to_python(self, value): - return SparseVec.from_db(value) + return SparseVector.from_db(value) def get_prep_value(self, value): - return SparseVec.to_db(value) + return SparseVector.to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index ef0df67..8f54401 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -2,13 +2,13 @@ from .halfvec import HalfvecField from .sparsevec import SparsevecField from .vector import VectorField -from ..utils import HalfVec, SparseVec +from ..utils import HalfVector, SparseVector __all__ = [ 'VectorField', 'HalfvecField', 'FixedBitField', 'SparsevecField', - 'HalfVec', - 'SparseVec' + 'HalfVector', + 'SparseVector' ] diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index 60edd83..e663d72 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -1,5 +1,5 @@ from peewee import Expression, Field, Value -from ..utils import HalfVec +from ..utils import HalfVector class HalfvecField(Field): @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return HalfVec.to_db(value) + return HalfVector.to_db(value) def python_value(self, value): - return HalfVec.from_db(value) + return HalfVector.from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index c44d4fe..272c9c6 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -1,5 +1,5 @@ from peewee import 
Expression, Field, Value -from ..utils import SparseVec +from ..utils import SparseVector class SparsevecField(Field): @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return SparseVec.to_db(value) + return SparseVector.to_db(value) def python_value(self, value): - return SparseVec.from_db(value) + return SparseVector.from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index b5df2d1..e47d699 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -3,7 +3,7 @@ from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import HalfVec, SparseVec +from ..utils import HalfVector, SparseVector __all__ = ['register_vector'] diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py index 023586a..6ca232d 100644 --- a/pgvector/psycopg/halfvec.py +++ b/pgvector/psycopg/halfvec.py @@ -1,53 +1,53 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import HalfVec +from ..utils import HalfVector -class HalfVecDumper(Dumper): +class HalfVectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return HalfVec.to_db(obj).encode('utf8') + return HalfVector.to_db(obj).encode('utf8') -class HalfVecBinaryDumper(HalfVecDumper): +class HalfVectorBinaryDumper(HalfVectorDumper): format = Format.BINARY def dump(self, obj): - return HalfVec.to_db_binary(obj) + return HalfVector.to_db_binary(obj) -class HalfVecLoader(Loader): +class HalfVectorLoader(Loader): format = Format.TEXT def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return HalfVec.from_db(data.decode('utf8')) + return HalfVector.from_db(data.decode('utf8')) -class HalfVecBinaryLoader(HalfVecLoader): +class 
HalfVectorBinaryLoader(HalfVectorLoader): format = Format.BINARY def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return HalfVec.from_db_binary(data) + return HalfVector.from_db_binary(data) def register_halfvec_info(context, info): info.register(context) # add oid to anonymous class for set_types - text_dumper = type('', (HalfVecDumper,), {'oid': info.oid}) - binary_dumper = type('', (HalfVecBinaryDumper,), {'oid': info.oid}) + text_dumper = type('', (HalfVectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (HalfVectorBinaryDumper,), {'oid': info.oid}) adapters = context.adapters - adapters.register_dumper(HalfVec, text_dumper) - adapters.register_dumper(HalfVec, binary_dumper) - adapters.register_loader(info.oid, HalfVecLoader) - adapters.register_loader(info.oid, HalfVecBinaryLoader) + adapters.register_dumper(HalfVector, text_dumper) + adapters.register_dumper(HalfVector, binary_dumper) + adapters.register_loader(info.oid, HalfVectorLoader) + adapters.register_loader(info.oid, HalfVectorBinaryLoader) diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py index a5dec21..634d83b 100644 --- a/pgvector/psycopg/sparsevec.py +++ b/pgvector/psycopg/sparsevec.py @@ -1,53 +1,53 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import SparseVec +from ..utils import SparseVector -class SparseVecDumper(Dumper): +class SparseVectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return SparseVec.to_db(obj).encode('utf8') + return SparseVector.to_db(obj).encode('utf8') -class SparseVecBinaryDumper(SparseVecDumper): +class SparseVectorBinaryDumper(SparseVectorDumper): format = Format.BINARY def dump(self, obj): - return SparseVec.to_db_binary(obj) + return SparseVector.to_db_binary(obj) -class SparseVecLoader(Loader): +class SparseVectorLoader(Loader): format = Format.TEXT def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return 
SparseVec.from_db(data.decode('utf8')) + return SparseVector.from_db(data.decode('utf8')) -class SparseVecBinaryLoader(SparseVecLoader): +class SparseVectorBinaryLoader(SparseVectorLoader): format = Format.BINARY def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return SparseVec.from_db_binary(data) + return SparseVector.from_db_binary(data) def register_sparsevec_info(context, info): info.register(context) # add oid to anonymous class for set_types - text_dumper = type('', (SparseVecDumper,), {'oid': info.oid}) - binary_dumper = type('', (SparseVecBinaryDumper,), {'oid': info.oid}) + text_dumper = type('', (SparseVectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (SparseVectorBinaryDumper,), {'oid': info.oid}) adapters = context.adapters - adapters.register_dumper(SparseVec, text_dumper) - adapters.register_dumper(SparseVec, binary_dumper) - adapters.register_loader(info.oid, SparseVecLoader) - adapters.register_loader(info.oid, SparseVecBinaryLoader) + adapters.register_dumper(SparseVector, text_dumper) + adapters.register_dumper(SparseVector, binary_dumper) + adapters.register_loader(info.oid, SparseVectorLoader) + adapters.register_loader(info.oid, SparseVectorBinaryLoader) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 764c0f7..9d0473a 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -2,7 +2,7 @@ from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import SparseVec +from ..utils import SparseVector __all__ = ['register_vector'] diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index 8d974b3..e5ec111 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_type, register_adapter, register_type -from ..utils import HalfVec +from ..utils import HalfVector class 
HalfvecAdapter(object): @@ -7,14 +7,14 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(HalfVec.to_db(self._value)).getquoted() + return adapt(HalfVector.to_db(self._value)).getquoted() def cast_halfvec(value, cur): - return HalfVec.from_db(value) + return HalfVector.from_db(value) def register_halfvec_info(oid): halfvec = new_type((oid,), 'HALFVEC', cast_halfvec) register_type(halfvec) - register_adapter(HalfVec, HalfvecAdapter) + register_adapter(HalfVector, HalfvecAdapter) diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index 6603b2f..4bcdd92 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_type, register_adapter, register_type -from ..utils import SparseVec +from ..utils import SparseVector class SparsevecAdapter(object): @@ -7,14 +7,14 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(SparseVec.to_db(self._value)).getquoted() + return adapt(SparseVector.to_db(self._value)).getquoted() def cast_sparsevec(value, cur): - return SparseVec.from_db(value) + return SparseVector.from_db(value) def register_sparsevec_info(oid): sparsevec = new_type((oid,), 'SPARSEVEC', cast_sparsevec) register_type(sparsevec) - register_adapter(SparseVec, SparsevecAdapter) + register_adapter(SparseVector, SparsevecAdapter) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 6e5a206..fde5752 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -2,6 +2,6 @@ from .halfvec import Halfvec from .sparsevec import Sparsevec from .vector import Vector -from ..utils import SparseVec +from ..utils import SparseVector __all__ = ['Vector', 'Halfvec', 'Bit', 'Sparsevec'] diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py index 694d14f..cafc6c6 100644 --- a/pgvector/sqlalchemy/halfvec.py +++ 
b/pgvector/sqlalchemy/halfvec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import HalfVec +from ..utils import HalfVector class Halfvec(UserDefinedType): @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return HalfVec.to_db(value, self.dim) + return HalfVector.to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(HalfVec.to_db(value, self.dim)) + return string_literal_processor(HalfVector.to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return HalfVec.from_db(value) + return HalfVector.from_db(value) return process class comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py index 6e3c691..499dff0 100644 --- a/pgvector/sqlalchemy/sparsevec.py +++ b/pgvector/sqlalchemy/sparsevec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import SparseVec +from ..utils import SparseVector class Sparsevec(UserDefinedType): @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return SparseVec.to_db(value, self.dim) + return SparseVector.to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(SparseVec.to_db(value, self.dim)) + return string_literal_processor(SparseVector.to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return SparseVec.from_db(value) + return 
SparseVector.from_db(value) return process class comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 75fca83..b977291 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -2,7 +2,7 @@ from struct import pack, unpack_from -class HalfVec: +class HalfVector: def __init__(self, value): if isinstance(value, np.ndarray): value = value.tolist() @@ -18,7 +18,7 @@ def to_list(self): def to_db(value, dim=None): if value is None: return value - if isinstance(value, HalfVec): + if isinstance(value, HalfVector): value = value.value if dim is not None and len(value) != dim: @@ -29,20 +29,20 @@ def to_db(value, dim=None): def to_db_binary(value): if value is None: return value - if isinstance(value, HalfVec): + if isinstance(value, HalfVector): value = value.value return pack(f'>HH{len(value)}e', len(value), 0, *value) def from_db(value): - if value is None or isinstance(value, HalfVec): + if value is None or isinstance(value, HalfVector): return value - return HalfVec([float(v) for v in value[1:-1].split(',')]) + return HalfVector([float(v) for v in value[1:-1].split(',')]) def from_db_binary(value): - if value is None or isinstance(value, HalfVec): + if value is None or isinstance(value, HalfVector): return value dim, unused = unpack_from('>HH', value) - return HalfVec(unpack_from(f'>{dim}e', value, 4)) + return HalfVector(unpack_from(f'>{dim}e', value, 4)) def __repr__(self): - return f'HalfVec({self.value})' + return f'HalfVector({self.value})' diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 040ef2c..0c6885c 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -3,15 +3,15 @@ def to_db_value(value): - if isinstance(value, SparseVec): + if isinstance(value, SparseVector): return value elif isinstance(value, (list, np.ndarray)): - return SparseVec.from_dense(value) + return SparseVector.from_dense(value) else: raise ValueError('expected 
sparsevec') -class SparseVec: +class SparseVector: def __init__(self, dim, indices, values): self.dim = dim self.indices = indices @@ -23,7 +23,7 @@ def from_dense(value): dim = len(value) indices = [i for i, v in enumerate(value) if v != 0] values = [value[i] for i in indices] - return SparseVec(dim, indices, values) + return SparseVector(dim, indices, values) def to_dense(self): vec = [0] * self.dim @@ -51,7 +51,7 @@ def to_db_binary(value): return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) def from_db(value): - if value is None or isinstance(value, SparseVec): + if value is None or isinstance(value, SparseVector): return value elements, dim = value.split('/') indices = [] @@ -60,15 +60,15 @@ def from_db(value): i, v = e.split(':') indices.append(int(i) - 1) values.append(float(v)) - return SparseVec(int(dim), indices, values) + return SparseVector(int(dim), indices, values) def from_db_binary(value): - if value is None or isinstance(value, SparseVec): + if value is None or isinstance(value, SparseVector): return value dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - return SparseVec(int(dim), indices, values) + return SparseVector(int(dim), indices, values) def __repr__(self): - return f'SparseVec({self.dim}, {self.indices}, {self.values})' + return f'SparseVector({self.dim}, {self.indices}, {self.values})' diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 7405631..474ba92 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,7 +1,7 @@ import asyncio import asyncpg import numpy as np -from pgvector.asyncpg import register_vector, SparseVec +from pgvector.asyncpg import register_vector, SparseVector import pytest @@ -88,7 +88,7 @@ async def test_sparsevec(self): await register_vector(conn) - embedding = SparseVec.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 2, 3]) await 
conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") diff --git a/tests/test_django.py b/tests/test_django.py index a8b1adf..586e9ad 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfvecField, BitField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVec, SparseVec +from pgvector.django import VectorExtension, VectorField, HalfvecField, BitField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector from unittest import mock settings.configure( @@ -94,7 +94,7 @@ def create_items(): [1, 1, 2] ] for i, v in enumerate(vectors): - item = Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v)) + item = Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v)) item.save() @@ -131,7 +131,7 @@ def test_bit(self): assert item.binary_embedding == '101' def test_sparsevec(self): - item = Item(id=1, sparse_embedding=SparseVec.from_dense([1, 2, 3])) + item = Item(id=1, sparse_embedding=SparseVector.from_dense([1, 2, 3])) item.save() item = Item.objects.get(pk=1) assert item.id == 1 @@ -167,56 +167,56 @@ def test_vector_l1_distance(self): def test_halfvec_l2_distance(self): create_items() - distance = L2Distance('half_embedding', HalfVec([1, 1, 1])) + distance = L2Distance('half_embedding', HalfVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] def test_halfvec_max_inner_product(self): create_items() - distance = MaxInnerProduct('half_embedding', 
HalfVec([1, 1, 1])) + distance = MaxInnerProduct('half_embedding', HalfVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] def test_halfvec_cosine_distance(self): create_items() - distance = CosineDistance('half_embedding', HalfVec([1, 1, 1])) + distance = CosineDistance('half_embedding', HalfVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] def test_halfvec_l1_distance(self): create_items() - distance = L1Distance('half_embedding', HalfVec([1, 1, 1])) + distance = L1Distance('half_embedding', HalfVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] def test_sparsevec_l2_distance(self): create_items() - distance = L2Distance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + distance = L2Distance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] def test_sparsevec_max_inner_product(self): create_items() - distance = MaxInnerProduct('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + distance = MaxInnerProduct('sparse_embedding', SparseVector.from_dense([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] def test_sparsevec_cosine_distance(self): create_items() - distance = CosineDistance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + distance = CosineDistance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) items = 
Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] def test_sparsevec_l1_distance(self): create_items() - distance = L1Distance('sparse_embedding', SparseVec.from_dense([1, 1, 1])) + distance = L1Distance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 4e7eb44..5c8ee5a 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfvecField, FixedBitField, SparsevecField, SparseVec +from pgvector.peewee import VectorField, HalfvecField, FixedBitField, SparsevecField, SparseVector db = PostgresqlDatabase('pgvector_python_test') @@ -36,7 +36,7 @@ def create_items(): [1, 1, 2] ] for i, v in enumerate(vectors): - Item.create(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v)) + Item.create(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v)) class TestPeewee: @@ -107,7 +107,7 @@ def test_halfvec_l1_distance(self): def test_sparsevec_l2_distance(self): create_items() - distance = Item.sparse_embedding.l2_distance(SparseVec.from_dense([1, 1, 1])) + distance = Item.sparse_embedding.l2_distance(SparseVector.from_dense([1, 1, 1])) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index f674a66..aeca9d9 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg import 
register_vector, register_vector_async, HalfVec, SparseVec +from pgvector.psycopg import register_vector, register_vector_async, HalfVector, SparseVector import psycopg import pytest @@ -56,7 +56,7 @@ def test_text_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: - copy.write_row([embedding, HalfVec(embedding), '101', SparseVec.from_dense(embedding)]) + copy.write_row([embedding, HalfVector(embedding), '101', SparseVector.from_dense(embedding)]) def test_binary_copy(self): embedding = np.array([1.5, 2, 3]) @@ -69,24 +69,24 @@ def test_binary_copy_set_types(self): cur = conn.cursor() with cur.copy("COPY psycopg_items (id, embedding, half_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(['int8', 'vector', 'halfvec', 'sparsevec']) - copy.write_row([1, embedding, HalfVec(embedding), SparseVec.from_dense(embedding)]) + copy.write_row([1, embedding, HalfVector(embedding), SparseVector.from_dense(embedding)]) def test_halfvec(self): conn.execute('DROP TABLE IF EXISTS half_items') conn.execute('CREATE TABLE half_items (id bigserial PRIMARY KEY, embedding halfvec(3))') - embedding = HalfVec([1.5, 2, 3]) + embedding = HalfVector([1.5, 2, 3]) conn.execute('INSERT INTO half_items (embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT * FROM half_items ORDER BY id').fetchall() def test_halfvec_binary_format(self): - embedding = HalfVec([1.5, 2, 3]) + embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res.to_list() == [1.5, 2, 3] def test_halfvec_text_format(self): - embedding = HalfVec([1.5, 2, 3]) + embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res.to_list() == [1.5, 2, 3] @@ -94,19 +94,19 @@ def test_sparsevec(self): conn.execute('DROP TABLE IF EXISTS 
sparse_items') conn.execute('CREATE TABLE sparse_items (id bigserial PRIMARY KEY, embedding sparsevec(6))') - embedding = SparseVec.from_dense([0, 1.5, 0, 2, 0, 3]) + embedding = SparseVector.from_dense([0, 1.5, 0, 2, 0, 3]) conn.execute('INSERT INTO sparse_items (embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT * FROM sparse_items ORDER BY id').fetchall() assert res[0][1].to_dense() == [0, 1.5, 0, 2, 0, 3] def test_sparsevec_binary_format(self): - embedding = SparseVec.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 2, 3]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res.to_dense() == [1.5, 2, 3] def test_sparsevec_text_format(self): - embedding = SparseVec.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 2, 3]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res.to_dense() == [1.5, 2, 3] diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 7302296..cc7e5c0 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg2 import register_vector, SparseVec +from pgvector.psycopg2 import register_vector, SparseVector import psycopg2 conn = psycopg2.connect(dbname='pgvector_python_test') @@ -46,7 +46,7 @@ def test_bit(self): assert res[1][0] is None def test_sparsevec(self): - embedding = SparseVec.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 2, 3]) cur.execute('INSERT INTO psycopg2_items (sparse_embedding) VALUES (%s), (NULL)', (embedding,)) cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 887859c..9378c82 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVec +from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, 
SparseVector import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -46,7 +46,7 @@ def create_items(): ] session = Session(engine) for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v))) + session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v))) session.commit() diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 58ac2c9..86f0110 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVec +from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVector import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError @@ -43,7 +43,7 @@ def create_items(): ] session = Session(engine) for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVec.from_dense(v))) + session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v))) session.commit() From 2a036aebcfec5289876383721eb2bf6fb6f4ba6e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 14:21:35 -0400 Subject: [PATCH 110/424] Renamed HalfvecField to HalfVectorField and SparsevecField to SparseVectorField --- pgvector/django/__init__.py | 8 ++++---- pgvector/django/halfvec.py | 2 +- pgvector/django/sparsevec.py | 2 +- pgvector/peewee/__init__.py | 8 ++++---- pgvector/peewee/halfvec.py | 4 ++-- pgvector/peewee/sparsevec.py | 4 ++-- tests/test_django.py | 10 +++++----- tests/test_peewee.py | 6 +++--- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 08407c7..09978a9 100644 --- a/pgvector/django/__init__.py +++ 
b/pgvector/django/__init__.py @@ -1,18 +1,18 @@ from .bit import BitField from .extensions import VectorExtension from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance -from .halfvec import HalfvecField +from .halfvec import HalfVectorField from .indexes import IvfflatIndex, HnswIndex -from .sparsevec import SparsevecField +from .sparsevec import SparseVectorField from .vector import VectorField from ..utils import HalfVector, SparseVector __all__ = [ 'VectorExtension', 'VectorField', - 'HalfvecField', + 'HalfVectorField', 'BitField', - 'SparsevecField', + 'SparseVectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 884f706..1c3bae5 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -3,7 +3,7 @@ # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ -class HalfvecField(Field): +class HalfVectorField(Field): description = 'Half vector' empty_strings_allowed = False diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 1f4c21f..3a06574 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -3,7 +3,7 @@ # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ -class SparsevecField(Field): +class SparseVectorField(Field): description = 'Sparse vector' empty_strings_allowed = False diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 8f54401..945e0dc 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,14 +1,14 @@ from .bit import FixedBitField -from .halfvec import HalfvecField -from .sparsevec import SparsevecField +from .halfvec import HalfVectorField +from .sparsevec import SparseVectorField from .vector import VectorField from ..utils import HalfVector, SparseVector __all__ = [ 'VectorField', - 'HalfvecField', + 'HalfVectorField', 'FixedBitField', - 'SparsevecField', + 
'SparseVectorField', 'HalfVector', 'SparseVector' ] diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index e663d72..e30dcd0 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -2,12 +2,12 @@ from ..utils import HalfVector -class HalfvecField(Field): +class HalfVectorField(Field): field_type = 'halfvec' def __init__(self, dimensions=None, *args, **kwargs): self.dimensions = dimensions - super(HalfvecField, self).__init__(*args, **kwargs) + super(HalfVectorField, self).__init__(*args, **kwargs) def get_modifiers(self): return self.dimensions and [self.dimensions] or None diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index 272c9c6..b6d3b91 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -2,12 +2,12 @@ from ..utils import SparseVector -class SparsevecField(Field): +class SparseVectorField(Field): field_type = 'sparsevec' def __init__(self, dimensions=None, *args, **kwargs): self.dimensions = dimensions - super(SparsevecField, self).__init__(*args, **kwargs) + super(SparseVectorField, self).__init__(*args, **kwargs) def get_modifiers(self): return self.dimensions and [self.dimensions] or None diff --git a/tests/test_django.py b/tests/test_django.py index 586e9ad..b5c67e4 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -8,7 +8,7 @@ from math import sqrt import numpy as np import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfvecField, BitField, SparsevecField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector +from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector from unittest import mock settings.configure( @@ -24,9 +24,9 @@ class 
Item(models.Model): embedding = VectorField(dimensions=3, null=True, blank=True) - half_embedding = HalfvecField(dimensions=3, null=True, blank=True) + half_embedding = HalfVectorField(dimensions=3, null=True, blank=True) binary_embedding = BitField(length=3, null=True, blank=True) - sparse_embedding = SparsevecField(dimensions=3, null=True, blank=True) + sparse_embedding = SparseVectorField(dimensions=3, null=True, blank=True) class Meta: app_label = 'myapp' @@ -60,9 +60,9 @@ class Migration(migrations.Migration): fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('embedding', pgvector.django.VectorField(dimensions=3, null=True, blank=True)), - ('half_embedding', pgvector.django.HalfvecField(dimensions=3, null=True, blank=True)), + ('half_embedding', pgvector.django.HalfVectorField(dimensions=3, null=True, blank=True)), ('binary_embedding', pgvector.django.BitField(length=3, null=True, blank=True)), - ('sparse_embedding', pgvector.django.SparsevecField(dimensions=3, null=True, blank=True)), + ('sparse_embedding', pgvector.django.SparseVectorField(dimensions=3, null=True, blank=True)), ], ), migrations.AddIndex( diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 5c8ee5a..d83ef6f 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfvecField, FixedBitField, SparsevecField, SparseVector +from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField, SparseVector db = PostgresqlDatabase('pgvector_python_test') @@ -13,9 +13,9 @@ class Meta: class Item(BaseModel): embedding = VectorField(dimensions=3, null=True) - half_embedding = HalfvecField(dimensions=3, null=True) + half_embedding = HalfVectorField(dimensions=3, null=True) binary_embedding = FixedBitField(max_length=3, null=True) - sparse_embedding = 
SparsevecField(dimensions=3, null=True) + sparse_embedding = SparseVectorField(dimensions=3, null=True) class Meta: table_name = 'peewee_item' From 26b69ca54f054cf75e20fa3d1de7c90cb3550bc2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 16:15:27 -0400 Subject: [PATCH 111/424] Added Bit class for Psycopg 3 --- pgvector/psycopg/__init__.py | 9 ++++++++- pgvector/psycopg/bit.py | 31 +++++++++++++++++++++++++++++++ pgvector/utils/__init__.py | 1 + pgvector/utils/bit.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_psycopg.py | 31 ++++++++++++++++++------------- 5 files changed, 92 insertions(+), 14 deletions(-) create mode 100644 pgvector/psycopg/bit.py create mode 100644 pgvector/utils/bit.py diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index e47d699..90ff8b7 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,9 +1,10 @@ import psycopg from psycopg.types import TypeInfo +from .bit import register_bit_info from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import HalfVector, SparseVector +from ..utils import Bit, HalfVector, SparseVector __all__ = ['register_vector'] @@ -13,6 +14,9 @@ def register_vector(context): info = TypeInfo.fetch(context, 'vector') register_vector_info(context, info) + info = TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + info = TypeInfo.fetch(context, 'halfvec') if info is not None: register_halfvec_info(context, info) @@ -26,6 +30,9 @@ async def register_vector_async(context): info = await TypeInfo.fetch(context, 'vector') register_vector_info(context, info) + info = await TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + info = await TypeInfo.fetch(context, 'halfvec') if info is not None: register_halfvec_info(context, info) diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py new file mode 100644 index 0000000..80cfac0 
--- /dev/null +++ b/pgvector/psycopg/bit.py @@ -0,0 +1,31 @@ +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from ..utils import Bit + + +class BitDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return Bit.to_db(obj).encode('utf8') + + +class BitBinaryDumper(BitDumper): + + format = Format.BINARY + + def dump(self, obj): + return Bit.to_db_binary(obj) + + +def register_bit_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (BitDumper,), {'oid': info.oid}) + binary_dumper = type('', (BitBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(Bit, text_dumper) + adapters.register_dumper(Bit, binary_dumper) diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 897862b..69e8cfb 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,3 +1,4 @@ +from .bit import * from .halfvec import * from .sparsevec import * from .vector import * diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py new file mode 100644 index 0000000..ef639ba --- /dev/null +++ b/pgvector/utils/bit.py @@ -0,0 +1,34 @@ +import numpy as np +from struct import pack, unpack_from + + +class Bit: + def __init__(self, value): + if isinstance(value, bytes): + count = unpack_from('>i', value)[0] + buf = np.frombuffer(value[4:], dtype=np.uint8) + self.value = np.unpackbits(buf, count=count).astype(bool) + elif isinstance(value, str): + self.value = np.array([v != '0' for v in value], dtype=bool) + else: + self.value = np.array(value, dtype=bool) + + def to_db(value): + if not isinstance(value, Bit): + raise ValueError('expected bit') + + value = value.value + return ''.join(value.astype(np.uint8).astype(str)) + + def to_db_binary(value): + if not isinstance(value, Bit): + raise ValueError('expected bit') + + value = value.value + return pack('>i', len(value)) + np.packbits(value).tobytes() + + def __str__(self): + 
return self.__class__.to_db(self) + + def __repr__(self): + return f'Bit({self})' diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index aeca9d9..cba62fd 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg import register_vector, register_vector_async, HalfVector, SparseVector +from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector import psycopg import pytest @@ -90,6 +90,23 @@ def test_halfvec_text_format(self): res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res.to_list() == [1.5, 2, 3] + def test_bit(self): + embedding = Bit([False, True, False, True, False, False, False, False, True]) + res = conn.execute('SELECT %s::bit(9)', (embedding,)).fetchone()[0] + assert res == '010100001' + assert str(Bit(res)) == '010100001' + + def test_bit_binary_format(self): + embedding = Bit([False, True, False, True, False, False, False, False, True]) + res = conn.execute('SELECT %b::bit(9)', (embedding,), binary=True).fetchone()[0] + assert str(Bit(res)) == '010100001' + assert repr(Bit(res)) == 'Bit(010100001)' + + def test_bit_text_format(self): + embedding = Bit([False, True, False, True, False, False, False, False, True]) + res = conn.execute('SELECT %t::bit(9)', (embedding,)).fetchone()[0] + assert res == '010100001' + def test_sparsevec(self): conn.execute('DROP TABLE IF EXISTS sparse_items') conn.execute('CREATE TABLE sparse_items (id bigserial PRIMARY KEY, embedding sparsevec(6))') @@ -110,18 +127,6 @@ def test_sparsevec_text_format(self): res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res.to_dense() == [1.5, 2, 3] - def test_bit(self): - res = conn.execute('SELECT %s::bit(3)', ('101',)).fetchone()[0] - assert res == '101' - - def test_bit_binary_format(self): - res = conn.execute('SELECT %b::bit(3)', ('101',), binary=True).fetchone()[0] - assert res == b'\x00\x00\x00\x03\xa0' - - def 
test_bit_text_format(self): - res = conn.execute('SELECT %t::bit(3)', ('101',)).fetchone()[0] - assert res == '101' - @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From bb060c87a8d23c67f78a5897e855a02dceecfa0d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 20:34:28 -0400 Subject: [PATCH 112/424] Moved copy tests [skip ci] --- tests/test_psycopg.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index cba62fd..c823eac 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -52,25 +52,6 @@ def test_vector_binary_format_non_contiguous(self): res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] assert np.array_equal(res, np.array([3, 2, 1.5])) - def test_text_copy(self): - embedding = np.array([1.5, 2, 3]) - cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: - copy.write_row([embedding, HalfVector(embedding), '101', SparseVector.from_dense(embedding)]) - - def test_binary_copy(self): - embedding = np.array([1.5, 2, 3]) - cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.write_row([embedding]) - - def test_binary_copy_set_types(self): - embedding = np.array([1.5, 2, 3]) - cur = conn.cursor() - with cur.copy("COPY psycopg_items (id, embedding, half_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.set_types(['int8', 'vector', 'halfvec', 'sparsevec']) - copy.write_row([1, embedding, HalfVector(embedding), SparseVector.from_dense(embedding)]) - def test_halfvec(self): conn.execute('DROP TABLE IF EXISTS half_items') conn.execute('CREATE TABLE half_items (id bigserial PRIMARY KEY, embedding halfvec(3))') @@ -127,6 +108,25 @@ def 
test_sparsevec_text_format(self): res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res.to_dense() == [1.5, 2, 3] + def test_text_copy(self): + embedding = np.array([1.5, 2, 3]) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: + copy.write_row([embedding, HalfVector(embedding), '101', SparseVector.from_dense(embedding)]) + + def test_binary_copy(self): + embedding = np.array([1.5, 2, 3]) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.write_row([embedding]) + + def test_binary_copy_set_types(self): + embedding = np.array([1.5, 2, 3]) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (id, embedding, half_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.set_types(['int8', 'vector', 'halfvec', 'sparsevec']) + copy.write_row([1, embedding, HalfVector(embedding), SparseVector.from_dense(embedding)]) + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From 950551ac3813af132849c5070f89ccc195bf67d1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 20:35:04 -0400 Subject: [PATCH 113/424] Added bit type to binary copy test --- tests/test_psycopg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index c823eac..790d71e 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -123,9 +123,9 @@ def test_binary_copy(self): def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY psycopg_items (id, embedding, half_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.set_types(['int8', 'vector', 'halfvec', 'sparsevec']) - copy.write_row([1, embedding, HalfVector(embedding), 
SparseVector.from_dense(embedding)]) + with cur.copy("COPY psycopg_items (id, embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.set_types(['int8', 'vector', 'halfvec', 'bit', 'sparsevec']) + copy.write_row([1, embedding, HalfVector(embedding), Bit('101'), SparseVector.from_dense(embedding)]) @pytest.mark.asyncio async def test_async(self): From e341c4d623ecffad8df23301f2e000b26f5aa126 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 20:43:00 -0400 Subject: [PATCH 114/424] Improved Psycopg 3 tests --- tests/test_psycopg.py | 46 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 790d71e..c06dacd 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -20,10 +20,10 @@ def test_vector(self): embedding = np.array([1.5, 2, 3]) conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s), (NULL)', (embedding,)) - res = conn.execute('SELECT * FROM psycopg_items ORDER BY id').fetchall() - assert np.array_equal(res[0][1], embedding) - assert res[0][1].dtype == np.float32 - assert res[1][1] is None + res = conn.execute('SELECT embedding FROM psycopg_items ORDER BY id').fetchall() + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None def test_vector_binary_format(self): embedding = np.array([1.5, 2, 3]) @@ -53,13 +53,11 @@ def test_vector_binary_format_non_contiguous(self): assert np.array_equal(res, np.array([3, 2, 1.5])) def test_halfvec(self): - conn.execute('DROP TABLE IF EXISTS half_items') - conn.execute('CREATE TABLE half_items (id bigserial PRIMARY KEY, embedding halfvec(3))') - embedding = HalfVector([1.5, 2, 3]) - conn.execute('INSERT INTO half_items (embedding) VALUES (%s)', (embedding,)) + conn.execute('INSERT INTO psycopg_items (half_embedding) VALUES (%s)', (embedding,)) - res = conn.execute('SELECT * FROM 
half_items ORDER BY id').fetchall() + res = conn.execute('SELECT half_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res.to_list() == [1.5, 2, 3] def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) @@ -72,10 +70,11 @@ def test_halfvec_text_format(self): assert res.to_list() == [1.5, 2, 3] def test_bit(self): - embedding = Bit([False, True, False, True, False, False, False, False, True]) - res = conn.execute('SELECT %s::bit(9)', (embedding,)).fetchone()[0] - assert res == '010100001' - assert str(Bit(res)) == '010100001' + embedding = Bit([True, False, True]) + conn.execute('INSERT INTO psycopg_items (binary_embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT binary_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res == '101' def test_bit_binary_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) @@ -87,26 +86,25 @@ def test_bit_text_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) res = conn.execute('SELECT %t::bit(9)', (embedding,)).fetchone()[0] assert res == '010100001' + assert str(Bit(res)) == '010100001' + assert repr(Bit(res)) == 'Bit(010100001)' def test_sparsevec(self): - conn.execute('DROP TABLE IF EXISTS sparse_items') - conn.execute('CREATE TABLE sparse_items (id bigserial PRIMARY KEY, embedding sparsevec(6))') - - embedding = SparseVector.from_dense([0, 1.5, 0, 2, 0, 3]) - conn.execute('INSERT INTO sparse_items (embedding) VALUES (%s)', (embedding,)) + embedding = SparseVector.from_dense([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) - res = conn.execute('SELECT * FROM sparse_items ORDER BY id').fetchall() - assert res[0][1].to_dense() == [0, 1.5, 0, 2, 0, 3] + res = conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res.to_dense() == [1.5, 2, 3] def test_sparsevec_binary_format(self): - embedding = 
SparseVector.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] - assert res.to_dense() == [1.5, 2, 3] + assert res.to_dense() == [1.5, 0, 2, 0, 3, 0] def test_sparsevec_text_format(self): - embedding = SparseVector.from_dense([1.5, 2, 3]) + embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] - assert res.to_dense() == [1.5, 2, 3] + assert res.to_dense() == [1.5, 0, 2, 0, 3, 0] def test_text_copy(self): embedding = np.array([1.5, 2, 3]) From 0d6260a328f3395456b619a80905a67d3bb160de Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 20:44:30 -0400 Subject: [PATCH 115/424] Improved test [skip ci] --- tests/test_psycopg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index c06dacd..a11b73c 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -115,8 +115,8 @@ def test_text_copy(self): def test_binary_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.write_row([embedding]) + with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.write_row([embedding, HalfVector(embedding), Bit('101'), SparseVector.from_dense(embedding)]) def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) From a8e971a9f7e1f22783b0bed2c837664434e1bfaa Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 20:59:28 -0400 Subject: [PATCH 116/424] Improved tests [skip ci] --- tests/test_django.py | 19 +++++-------------- tests/test_peewee.py | 18 +++++------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 
b5c67e4..2652e39 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -88,14 +88,9 @@ class Migration(migrations.Migration): def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] - for i, v in enumerate(vectors): - item = Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v)) - item.save() + Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1])).save() + Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2])).save() + Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2])).save() class ItemForm(ModelForm): @@ -222,18 +217,14 @@ def test_sparsevec_l1_distance(self): assert [v.distance for v in items] == [0, 1, 3] def test_bit_hamming_distance(self): - Item(id=1, binary_embedding='000').save() - Item(id=2, binary_embedding='101').save() - Item(id=3, binary_embedding='111').save() + create_items() distance = HammingDistance('binary_embedding', '101') items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [0, 1, 2] def test_bit_jaccard_distance(self): - Item(id=1, binary_embedding='000').save() - Item(id=2, binary_embedding='101').save() - Item(id=3, binary_embedding='111').save() + create_items() distance = JaccardDistance('binary_embedding', '101') items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] diff --git a/tests/test_peewee.py b/tests/test_peewee.py index d83ef6f..a5f3d97 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -30,13 +30,9 @@ class Meta: def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] - for i, v in enumerate(vectors): - Item.create(id=i + 1, embedding=v, 
half_embedding=v, sparse_embedding=SparseVector.from_dense(v)) + Item.create(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1])) + Item.create(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2])) + Item.create(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2])) class TestPeewee: @@ -134,18 +130,14 @@ def test_sparsevec_l1_distance(self): assert [v.distance for v in items] == [0, 1, 3] def test_bit_hamming_distance(self): - Item.create(id=1, binary_embedding='000') - Item.create(id=2, binary_embedding='101') - Item.create(id=3, binary_embedding='111') + create_items() distance = Item.binary_embedding.hamming_distance('101') items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [0, 1, 2] def test_bit_jaccard_distance(self): - Item.create(id=1, binary_embedding='000') - Item.create(id=2, binary_embedding='101') - Item.create(id=3, binary_embedding='111') + create_items() distance = Item.binary_embedding.jaccard_distance('101') items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [2, 3, 1] From ad7a1863e208c4507f0d96f71aa4c16edee6c777 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:01:12 -0400 Subject: [PATCH 117/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9378c82..c0fdb5c 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -39,14 +39,10 @@ class Item(Base): def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] session = Session(engine) - for i, v 
in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v))) + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2]))) session.commit() @@ -174,21 +170,13 @@ def test_sparsevec_l2_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_bit_hamming_distance(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='000')) - session.add(Item(id=2, binary_embedding='101')) - session.add(Item(id=3, binary_embedding='111')) - session.commit() + create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='000')) - session.add(Item(id=2, binary_embedding='101')) - session.add(Item(id=3, binary_embedding='111')) - session.commit() + create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] From ff129253971b2923ddf53178663236ac5f8f2a4f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:02:58 -0400 Subject: [PATCH 118/424] Improved tests [skip ci] --- tests/test_sqlmodel.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 86f0110..c527a18 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -36,14 +36,10 @@ class 
Item(SQLModel, table=True): def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] session = Session(engine) - for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v, half_embedding=v, sparse_embedding=SparseVector.from_dense(v))) + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2]))) session.commit() @@ -149,21 +145,13 @@ def test_sparsevec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_bit_hamming_distance(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='000')) - session.add(Item(id=2, binary_embedding='101')) - session.add(Item(id=3, binary_embedding='111')) - session.commit() + create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='000')) - session.add(Item(id=2, binary_embedding='101')) - session.add(Item(id=3, binary_embedding='111')) - session.commit() + create_items() with Session(engine) as session: items = session.exec(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) assert [v.id for v in items] == [2, 3, 1] From ce48133aacc9073adedff88863912402f40eeeea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:07:50 -0400 Subject: [PATCH 119/424] Improved table names [skip ci] --- tests/test_django.py | 6 +++--- tests/test_sqlalchemy.py | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/tests/test_django.py b/tests/test_django.py index 2652e39..a0c837c 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -29,7 +29,7 @@ class Item(models.Model): sparse_embedding = SparseVectorField(dimensions=3, null=True, blank=True) class Meta: - app_label = 'myapp' + app_label = 'django_app' indexes = [ IvfflatIndex( name='ivfflat_idx', @@ -77,9 +77,9 @@ class Migration(migrations.Migration): # probably a better way to do this -migration = Migration('initial', 'myapp') +migration = Migration('initial', 'django_app') loader = MigrationLoader(connection, replace_migrations=False) -loader.graph.add_node(('myapp', migration.name), migration) +loader.graph.add_node(('django_app', migration.name), migration) sql_statements = loader.collect_sql([(migration, False)]) with connection.cursor() as cursor: diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index c0fdb5c..b03e6a0 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -16,7 +16,7 @@ class Item(Base): - __tablename__ = 'orm_item' + __tablename__ = 'sqlalchemy_orm_item' id = mapped_column(Integer, primary_key=True) embedding = mapped_column(Vector(3)) @@ -29,7 +29,7 @@ class Item(Base): Base.metadata.create_all(engine) index = Index( - 'orm_index', + 'sqlalchemy_orm_index', Item.embedding, postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, @@ -56,7 +56,7 @@ def test_core(self): metadata = MetaData() item_table = Table( - 'core_item', + 'sqlalchemy_core_item', metadata, Column('id', Integer, primary_key=True), Column('embedding', Vector(3)), @@ -69,7 +69,7 @@ def test_core(self): metadata.create_all(engine) ivfflat_index = Index( - 'ivfflat_core_index', + 'sqlalchemy_core_ivfflat_index', item_table.c.embedding, postgresql_using='ivfflat', postgresql_with={'lists': 1}, @@ -78,7 +78,7 @@ def test_core(self): ivfflat_index.create(engine) hnsw_index = Index( - 'hnsw_core_index', + 'sqlalchemy_core_hnsw_index', item_table.c.embedding, 
postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, @@ -263,7 +263,7 @@ def test_bad_dtype(self): session.commit() def test_inspect(self): - columns = inspect(engine).get_columns('orm_item') + columns = inspect(engine).get_columns('sqlalchemy_orm_item') assert isinstance(columns[1]['type'], Vector) def test_literal_binds(self): @@ -277,7 +277,7 @@ def test_insert_bulk(self): session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) def test_insert_text(self): - session.execute(text('INSERT INTO orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) @pytest.mark.asyncio async def test_async(self): From 2fb168420685a5ec679e28810e2796bb2b963f37 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:08:23 -0400 Subject: [PATCH 120/424] Fixed table name [skip ci] --- tests/test_django.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_django.py b/tests/test_django.py index a0c837c..9523c65 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -83,7 +83,7 @@ class Migration(migrations.Migration): sql_statements = loader.collect_sql([(migration, False)]) with connection.cursor() as cursor: - cursor.execute("DROP TABLE IF EXISTS myapp_item") + cursor.execute("DROP TABLE IF EXISTS django_app_item") cursor.execute('\n'.join(sql_statements)) From ab048e93906f9709ab7f7325c41bac392f1dfd03 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:18:47 -0400 Subject: [PATCH 121/424] Removed unneeded tests [skip ci] --- tests/test_asyncpg.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 474ba92..4f0a4bd 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -19,8 +19,6 @@ async def test_vector(self): await conn.execute("INSERT INTO asyncpg_items (embedding) 
VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) assert res[0]['embedding'].dtype == np.float32 assert res[1]['embedding'] is None @@ -44,8 +42,6 @@ async def test_halfvec(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 assert res[0]['embedding'].to_list() == [1.5, 2, 3] assert res[1]['embedding'] is None @@ -68,8 +64,6 @@ async def test_bit(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 assert res[0]['embedding'].to_int() == 5 assert res[1]['embedding'] is None @@ -92,8 +86,6 @@ async def test_sparsevec(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 assert res[0]['embedding'].to_dense() == [1.5, 2, 3] assert res[1]['embedding'] is None @@ -119,8 +111,6 @@ async def init(conn): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 assert np.array_equal(res[0]['embedding'], embedding) assert res[0]['embedding'].dtype == np.float32 assert res[1]['embedding'] is None From 1566eac74e42b8d512bd3ce30c7bfc275995237e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:37:26 -0400 Subject: [PATCH 122/424] Added more tests [skip ci] --- tests/test_peewee.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_peewee.py 
b/tests/test_peewee.py index a5f3d97..1146234 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -73,6 +73,11 @@ def test_vector_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_halfvec(self): + Item.create(id=1, half_embedding=[1, 2, 3]) + item = Item.get_by_id(1) + assert item.half_embedding.to_list() == [1, 2, 3] + def test_halfvec_l2_distance(self): create_items() distance = Item.half_embedding.l2_distance([1, 1, 1]) @@ -101,6 +106,11 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_sparsevec(self): + Item.create(id=1, sparse_embedding=[1, 2, 3]) + item = Item.get_by_id(1) + assert item.sparse_embedding.to_dense() == [1, 2, 3] + def test_sparsevec_l2_distance(self): create_items() distance = Item.sparse_embedding.l2_distance(SparseVector.from_dense([1, 1, 1])) @@ -129,6 +139,11 @@ def test_sparsevec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_bit(self): + Item.create(id=1, binary_embedding='101') + item = Item.get_by_id(1) + assert item.binary_embedding == '101' + def test_bit_hamming_distance(self): create_items() distance = Item.binary_embedding.hamming_distance('101') From b2ab8617a1d02c1259bc7e939047f39d1de47e02 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:39:47 -0400 Subject: [PATCH 123/424] Improved tests [skip ci] --- tests/test_django.py | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 9523c65..9f38761 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -104,34 +104,11 @@ def setup_method(self, test_method): Item.objects.all().delete() def test_vector(self): - item = Item(id=1, embedding=[1, 2, 3]) - item.save() + Item(id=1, embedding=[1, 2, 3]).save() item = 
Item.objects.get(pk=1) - assert item.id == 1 assert np.array_equal(item.embedding, np.array([1, 2, 3])) assert item.embedding.dtype == np.float32 - def test_halfvec(self): - item = Item(id=1, half_embedding=[1, 2, 3]) - item.save() - item = Item.objects.get(pk=1) - assert item.id == 1 - assert item.half_embedding.to_list() == [1, 2, 3] - - def test_bit(self): - item = Item(id=1, binary_embedding='101') - item.save() - item = Item.objects.get(pk=1) - assert item.id == 1 - assert item.binary_embedding == '101' - - def test_sparsevec(self): - item = Item(id=1, sparse_embedding=SparseVector.from_dense([1, 2, 3])) - item.save() - item = Item.objects.get(pk=1) - assert item.id == 1 - assert item.sparse_embedding.to_dense() == [1, 2, 3] - def test_vector_l2_distance(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) @@ -160,6 +137,11 @@ def test_vector_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_halfvec(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + assert item.half_embedding.to_list() == [1, 2, 3] + def test_halfvec_l2_distance(self): create_items() distance = L2Distance('half_embedding', HalfVector([1, 1, 1])) @@ -188,6 +170,11 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_sparsevec(self): + Item(id=1, sparse_embedding=SparseVector.from_dense([1, 2, 3])).save() + item = Item.objects.get(pk=1) + assert item.sparse_embedding.to_dense() == [1, 2, 3] + def test_sparsevec_l2_distance(self): create_items() distance = L2Distance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) @@ -216,6 +203,11 @@ def test_sparsevec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_bit(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + assert 
item.binary_embedding == '101' + def test_bit_hamming_distance(self): create_items() distance = HammingDistance('binary_embedding', '101') From 0250d419cf5e2ab483a8c04bf38ed8dc5c27654e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:45:37 -0400 Subject: [PATCH 124/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index b03e6a0..56a2539 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -109,6 +109,13 @@ def test_orm(self): assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None + def test_vector(self): + session = Session(engine) + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] + def test_vector_l2_distance(self): create_items() with Session(engine) as session: @@ -157,18 +164,39 @@ def test_vector_l1_distance_orm(self): items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_halfvec(self): + session = Session(engine) + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] + def test_halfvec_l2_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec(self): + session = Session(engine) + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_dense() == [1, 2, 3] + def test_sparsevec_l2_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in 
items] == [1, 3, 2] + def test_bit(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + def test_bit_hamming_distance(self): create_items() with Session(engine) as session: From 505af704163cce9dcbd56b1491b435e36d282f18 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:48:56 -0400 Subject: [PATCH 125/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 96 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 56a2539..174b5a0 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -177,6 +177,48 @@ def test_halfvec_l2_distance(self): items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] + def test_halfvec_l2_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1])).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_max_inner_product_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_cosine_distance_orm(self): + create_items() + with Session(engine) as session: + items = 
session.scalars(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_l1_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec(self): session = Session(engine) session.add(Item(id=1, sparse_embedding=[1, 2, 3])) @@ -190,6 +232,48 @@ def test_sparsevec_l2_distance(self): items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec_l2_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1])).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_max_inner_product_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_cosine_distance_orm(self): + create_items() + with Session(engine) as session: + items = 
session.scalars(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_l1_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + def test_bit(self): session = Session(engine) session.add(Item(id=1, binary_embedding='101')) @@ -203,12 +287,24 @@ def test_bit_hamming_distance(self): items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] + def test_bit_hamming_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + def test_bit_jaccard_distance(self): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] + def test_bit_jaccard_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + def test_filter(self): create_items() with Session(engine) as session: From 2c7b585afa976de6bcf2831eea24cf1cbda610f0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 20 May 2024 21:51:07 -0400 Subject: [PATCH 126/424] Improved tests [skip ci] --- tests/test_sqlmodel.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 
c527a18..c3716c9 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -72,6 +72,13 @@ def test_orm(self): assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None + def test_vector(self): + session = Session(engine) + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] + def test_vector_l2_distance(self): create_items() with Session(engine) as session: @@ -96,6 +103,13 @@ def test_vector_l1_distance(self): items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_halfvec(self): + session = Session(engine) + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] + def test_halfvec_l2_distance(self): create_items() with Session(engine) as session: @@ -120,6 +134,13 @@ def test_halfvec_l1_distance(self): items = session.exec(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_sparsevec(self): + session = Session(engine) + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_dense() == [1, 2, 3] + def test_sparsevec_l2_distance(self): create_items() with Session(engine) as session: @@ -144,6 +165,13 @@ def test_sparsevec_l1_distance(self): items = session.exec(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_bit(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + def test_bit_hamming_distance(self): create_items() with Session(engine) as session: From 22025d4b8d87f191f500f49395b3908cc85995f9 Mon Sep 17 00:00:00 2001 From: Andrew Kane 
Date: Mon, 20 May 2024 21:54:57 -0400 Subject: [PATCH 127/424] Moved tests [skip ci] --- tests/test_django.py | 38 ++++++++++++------------ tests/test_peewee.py | 38 ++++++++++++------------ tests/test_sqlalchemy.py | 62 ++++++++++++++++++++-------------------- tests/test_sqlmodel.py | 38 ++++++++++++------------ 4 files changed, 88 insertions(+), 88 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 9f38761..b40a47a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -170,6 +170,25 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_bit(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + distance = HammingDistance('binary_embedding', '101') + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + create_items() + distance = JaccardDistance('binary_embedding', '101') + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + def test_sparsevec(self): Item(id=1, sparse_embedding=SparseVector.from_dense([1, 2, 3])).save() item = Item.objects.get(pk=1) @@ -203,25 +222,6 @@ def test_sparsevec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] - def test_bit(self): - Item(id=1, binary_embedding='101').save() - item = Item.objects.get(pk=1) - assert item.binary_embedding == '101' - - def test_bit_hamming_distance(self): - create_items() - distance = HammingDistance('binary_embedding', '101') - items = Item.objects.annotate(distance=distance).order_by(distance) - assert [v.id for v in items] == [2, 3, 1] - 
assert [v.distance for v in items] == [0, 1, 2] - - def test_bit_jaccard_distance(self): - create_items() - distance = JaccardDistance('binary_embedding', '101') - items = Item.objects.annotate(distance=distance).order_by(distance) - assert [v.id for v in items] == [2, 3, 1] - # assert [v.distance for v in items] == [0, 1/3, 1] - def test_filter(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 1146234..deb571f 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -106,6 +106,25 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] + def test_bit(self): + Item.create(id=1, binary_embedding='101') + item = Item.get_by_id(1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + distance = Item.binary_embedding.hamming_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + create_items() + distance = Item.binary_embedding.jaccard_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + def test_sparsevec(self): Item.create(id=1, sparse_embedding=[1, 2, 3]) item = Item.get_by_id(1) @@ -139,25 +158,6 @@ def test_sparsevec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] - def test_bit(self): - Item.create(id=1, binary_embedding='101') - item = Item.get_by_id(1) - assert item.binary_embedding == '101' - - def test_bit_hamming_distance(self): - create_items() - distance = Item.binary_embedding.hamming_distance('101') - items = Item.select(Item.id, 
distance.alias('distance')).order_by(distance).limit(5) - assert [v.id for v in items] == [2, 3, 1] - assert [v.distance for v in items] == [0, 1, 2] - - def test_bit_jaccard_distance(self): - create_items() - distance = Item.binary_embedding.jaccard_distance('101') - items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) - assert [v.id for v in items] == [2, 3, 1] - # assert [v.distance for v in items] == [0, 1/3, 1] - def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 174b5a0..1fff283 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -219,6 +219,37 @@ def test_halfvec_l1_distance_orm(self): items = session.scalars(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_bit(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_hamming_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance_orm(self): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert 
[v.id for v in items] == [2, 3, 1] + def test_sparsevec(self): session = Session(engine) session.add(Item(id=1, sparse_embedding=[1, 2, 3])) @@ -274,37 +305,6 @@ def test_sparsevec_l1_distance_orm(self): items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' - - def test_bit_hamming_distance(self): - create_items() - with Session(engine) as session: - items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() - assert [v.id for v in items] == [2, 3, 1] - - def test_bit_hamming_distance_orm(self): - create_items() - with Session(engine) as session: - items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) - assert [v.id for v in items] == [2, 3, 1] - - def test_bit_jaccard_distance(self): - create_items() - with Session(engine) as session: - items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() - assert [v.id for v in items] == [2, 3, 1] - - def test_bit_jaccard_distance_orm(self): - create_items() - with Session(engine) as session: - items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) - assert [v.id for v in items] == [2, 3, 1] - def test_filter(self): create_items() with Session(engine) as session: diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index c3716c9..8563bec 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -134,6 +134,25 @@ def test_halfvec_l1_distance(self): items = session.exec(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_bit(self): + session = Session(engine) + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = 
session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + def test_sparsevec(self): session = Session(engine) session.add(Item(id=1, sparse_embedding=[1, 2, 3])) @@ -165,25 +184,6 @@ def test_sparsevec_l1_distance(self): items = session.exec(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' - - def test_bit_hamming_distance(self): - create_items() - with Session(engine) as session: - items = session.exec(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) - assert [v.id for v in items] == [2, 3, 1] - - def test_bit_jaccard_distance(self): - create_items() - with Session(engine) as session: - items = session.exec(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) - assert [v.id for v in items] == [2, 3, 1] - def test_filter(self): create_items() with Session(engine) as session: From 68c0c4d3a242f5df0514f7582cdf8c1ed49f884b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 12:52:58 -0400 Subject: [PATCH 128/424] Added sparse search example [skip ci] --- README.md | 1 + examples/sparse_search.py | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 examples/sparse_search.py diff --git a/README.md b/README.md index 28703a6..710525a 100644 --- a/README.md +++ b/README.md @@ 
-30,6 +30,7 @@ Or check out some examples: - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_embeddings.py) with SentenceTransformers - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) +- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers (unreleased) - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing - [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit diff --git a/examples/sparse_search.py b/examples/sparse_search.py new file mode 100644 index 0000000..7e786e3 --- /dev/null +++ b/examples/sparse_search.py @@ -0,0 +1,54 @@ +# good resources +# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/ +# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1 + +import numpy as np +from pgvector.psycopg import register_vector, SparseVector +import psycopg +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))') + +model_id = 'opensearch-project/opensearch-neural-sparse-encoding-v1' +model = AutoModelForMaskedLM.from_pretrained(model_id) +tokenizer = 
AutoTokenizer.from_pretrained(model_id) +special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()] + + +def fetch_embeddings(input): + feature = tokenizer( + input, + padding=True, + truncation=True, + return_tensors='pt', + return_token_type_ids=False + ) + output = model(**feature)[0] + + values, _ = torch.max(output * feature['attention_mask'].unsqueeze(-1), dim=1) + values = torch.log(1 + torch.relu(values)) + values[:, special_token_ids] = 0 + return values.detach().cpu().numpy() + + +# note: works much better with longer content +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = fetch_embeddings(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector.from_dense(embedding))) + +query = 'forest' +query_embedding = fetch_embeddings([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector.from_dense(query_embedding),)).fetchall() +for row in result: + print(row[0]) From 44855c65d295d4e5380fe49d3ef978bf372915c7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 14:30:39 -0400 Subject: [PATCH 129/424] Added support for Vector class to Psycopg 3 [skip ci] --- pgvector/psycopg/__init__.py | 2 +- pgvector/psycopg/vector.py | 2 ++ pgvector/utils/vector.py | 14 ++++++++++++++ tests/test_psycopg.py | 12 +++++++++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 90ff8b7..d9e2a95 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -4,7 +4,7 @@ from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import Bit, HalfVector, SparseVector +from ..utils import Bit, HalfVector, SparseVector, Vector __all__ = 
['register_vector'] diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index fe606a6..4875fe2 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -51,5 +51,7 @@ def register_vector_info(context, info): adapters = context.adapters adapters.register_dumper('numpy.ndarray', text_dumper) adapters.register_dumper('numpy.ndarray', binary_dumper) + adapters.register_dumper(Vector, text_dumper) + adapters.register_dumper(Vector, binary_dumper) adapters.register_loader(info.oid, VectorLoader) adapters.register_loader(info.oid, VectorBinaryLoader) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index c172b5e..3f44e0d 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -3,6 +3,15 @@ class Vector: + def __init__(self, value): + if isinstance(value, np.ndarray): + value = value.tolist() + + if not isinstance(value, (list, tuple)): + raise ValueError('expected list or tuple') + + self.value = value + def from_db(value): # could be ndarray if already cast by lower-level driver if value is None or isinstance(value, np.ndarray): @@ -29,6 +38,8 @@ def to_db(value, dim=None): raise ValueError('dtype must be numeric') value = value.tolist() + elif isinstance(value, Vector): + value = value.value if dim is not None and len(value) != dim: raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) @@ -39,6 +50,9 @@ def to_db_binary(value): if value is None: return value + if isinstance(value, Vector): + value = value.value + value = np.asarray(value, dtype='>f') if value.ndim != 1: diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index a11b73c..956a25d 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector +from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector, Vector import psycopg import pytest @@ 
-52,6 +52,16 @@ def test_vector_binary_format_non_contiguous(self): res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] assert np.array_equal(res, np.array([3, 2, 1.5])) + def test_vector_class_binary_format(self): + embedding = Vector([1.5, 2, 3]) + res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] + assert np.array_equal(res, np.array([1.5, 2, 3])) + + def test_vector_class_text_format(self): + embedding = Vector([1.5, 2, 3]) + res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] + assert np.array_equal(res, np.array([1.5, 2, 3])) + def test_halfvec(self): embedding = HalfVector([1.5, 2, 3]) conn.execute('INSERT INTO psycopg_items (half_embedding) VALUES (%s)', (embedding,)) From e5a8284486c09fb92445e06faec412458247e670 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 14:51:10 -0400 Subject: [PATCH 130/424] Improved __all__ [skip ci] --- pgvector/asyncpg/__init__.py | 6 +++++- pgvector/psycopg/__init__.py | 8 +++++++- pgvector/psycopg2/__init__.py | 8 ++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 2252c7a..67d6f5c 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,6 +1,10 @@ from ..utils import Vector, HalfVector, SparseVector -__all__ = ['register_vector'] +__all__ = [ + 'register_vector', + 'HalfVector', + 'SparseVector' +] async def register_vector(conn): diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index d9e2a95..20ff68f 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -7,7 +7,13 @@ from ..utils import Bit, HalfVector, SparseVector, Vector -__all__ = ['register_vector'] +__all__ = [ + 'register_vector', + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] def register_vector(context): diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 9d0473a..7f34d7c 100644 --- 
a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -2,9 +2,13 @@ from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info -from ..utils import SparseVector +from ..utils import HalfVector, SparseVector -__all__ = ['register_vector'] +__all__ = [ + 'register_vector', + 'HalfVector', + 'SparseVector' +] def register_vector(conn_or_curs=None): From 6f8a2cfdd849ea917272efefea69f1163e209ba2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 14:53:27 -0400 Subject: [PATCH 131/424] Improved code [skip ci] --- pgvector/asyncpg/__init__.py | 24 +-------------------- pgvector/asyncpg/register.py | 24 +++++++++++++++++++++ pgvector/psycopg/__init__.py | 41 ++---------------------------------- pgvector/psycopg/register.py | 38 +++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 62 deletions(-) create mode 100644 pgvector/asyncpg/register.py create mode 100644 pgvector/psycopg/register.py diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 67d6f5c..f4e8754 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,3 +1,4 @@ +from .register import register_vector from ..utils import Vector, HalfVector, SparseVector __all__ = [ @@ -5,26 +6,3 @@ 'HalfVector', 'SparseVector' ] - - -async def register_vector(conn): - await conn.set_type_codec( - 'vector', - encoder=Vector.to_db_binary, - decoder=Vector.from_db_binary, - format='binary' - ) - - await conn.set_type_codec( - 'halfvec', - encoder=HalfVector.to_db_binary, - decoder=HalfVector.from_db_binary, - format='binary' - ) - - await conn.set_type_codec( - 'sparsevec', - encoder=SparseVector.to_db_binary, - decoder=SparseVector.from_db_binary, - format='binary' - ) diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py new file mode 100644 index 0000000..4664218 --- /dev/null +++ b/pgvector/asyncpg/register.py @@ -0,0 +1,24 @@ +from ..utils 
import Vector, HalfVector, SparseVector + + +async def register_vector(conn): + await conn.set_type_codec( + 'vector', + encoder=Vector.to_db_binary, + decoder=Vector.from_db_binary, + format='binary' + ) + + await conn.set_type_codec( + 'halfvec', + encoder=HalfVector.to_db_binary, + decoder=HalfVector.from_db_binary, + format='binary' + ) + + await conn.set_type_codec( + 'sparsevec', + encoder=SparseVector.to_db_binary, + decoder=SparseVector.from_db_binary, + format='binary' + ) diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 20ff68f..9007c37 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,48 +1,11 @@ -import psycopg -from psycopg.types import TypeInfo -from .bit import register_bit_info -from .halfvec import register_halfvec_info -from .sparsevec import register_sparsevec_info -from .vector import register_vector_info +from .register import register_vector, register_vector_async from ..utils import Bit, HalfVector, SparseVector, Vector - __all__ = [ 'register_vector', + 'register_vector_async', 'Vector', 'HalfVector', 'Bit', 'SparseVector' ] - - -def register_vector(context): - info = TypeInfo.fetch(context, 'vector') - register_vector_info(context, info) - - info = TypeInfo.fetch(context, 'bit') - register_bit_info(context, info) - - info = TypeInfo.fetch(context, 'halfvec') - if info is not None: - register_halfvec_info(context, info) - - info = TypeInfo.fetch(context, 'sparsevec') - if info is not None: - register_sparsevec_info(context, info) - - -async def register_vector_async(context): - info = await TypeInfo.fetch(context, 'vector') - register_vector_info(context, info) - - info = await TypeInfo.fetch(context, 'bit') - register_bit_info(context, info) - - info = await TypeInfo.fetch(context, 'halfvec') - if info is not None: - register_halfvec_info(context, info) - - info = await TypeInfo.fetch(context, 'sparsevec') - if info is not None: - register_sparsevec_info(context, info) diff 
--git a/pgvector/psycopg/register.py b/pgvector/psycopg/register.py new file mode 100644 index 0000000..7f54a31 --- /dev/null +++ b/pgvector/psycopg/register.py @@ -0,0 +1,38 @@ +import psycopg +from psycopg.types import TypeInfo +from .bit import register_bit_info +from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info +from .vector import register_vector_info + + +def register_vector(context): + info = TypeInfo.fetch(context, 'vector') + register_vector_info(context, info) + + info = TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + + info = TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) + + +async def register_vector_async(context): + info = await TypeInfo.fetch(context, 'vector') + register_vector_info(context, info) + + info = await TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + + info = await TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = await TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) From db250ee221ebf8aab8933ba37a48c1c6826a33a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 15:00:05 -0400 Subject: [PATCH 132/424] Improved code [skip ci] --- pgvector/psycopg2/__init__.py | 27 +-------------------------- pgvector/psycopg2/register.py | 26 ++++++++++++++++++++++++++ pgvector/sqlalchemy/__init__.py | 11 +++++++++-- 3 files changed, 36 insertions(+), 28 deletions(-) create mode 100644 pgvector/psycopg2/register.py diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 7f34d7c..7c95295 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,7 +1,4 @@ -import psycopg2 -from .halfvec import register_halfvec_info -from .sparsevec import 
register_sparsevec_info -from .vector import register_vector_info +from .register import register_vector from ..utils import HalfVector, SparseVector __all__ = [ @@ -9,25 +6,3 @@ 'HalfVector', 'SparseVector' ] - - -def register_vector(conn_or_curs=None): - cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs - - try: - cur.execute('SELECT NULL::vector') - register_vector_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: - raise psycopg2.ProgrammingError('vector type not found in the database') - - try: - cur.execute('SELECT NULL::halfvec') - register_halfvec_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: - pass - - try: - cur.execute('SELECT NULL::sparsevec') - register_sparsevec_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: - pass diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py new file mode 100644 index 0000000..0ffd461 --- /dev/null +++ b/pgvector/psycopg2/register.py @@ -0,0 +1,26 @@ +import psycopg2 +from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info +from .vector import register_vector_info + + +def register_vector(conn_or_curs=None): + cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs + + try: + cur.execute('SELECT NULL::vector') + register_vector_info(cur.description[0][1]) + except psycopg2.errors.UndefinedObject: + raise psycopg2.ProgrammingError('vector type not found in the database') + + try: + cur.execute('SELECT NULL::halfvec') + register_halfvec_info(cur.description[0][1]) + except psycopg2.errors.UndefinedObject: + pass + + try: + cur.execute('SELECT NULL::sparsevec') + register_sparsevec_info(cur.description[0][1]) + except psycopg2.errors.UndefinedObject: + pass diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index fde5752..355a2d7 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -2,6 +2,13 
@@ from .halfvec import Halfvec from .sparsevec import Sparsevec from .vector import Vector -from ..utils import SparseVector +from ..utils import HalfVector, SparseVector -__all__ = ['Vector', 'Halfvec', 'Bit', 'Sparsevec'] +__all__ = [ + 'Vector', + 'Halfvec', + 'Bit', + 'Sparsevec', + 'HalfVector', + 'SparseVector' +] From e0c84e4c0f8def287aa45327bf809fc2b8760abe Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 15:03:54 -0400 Subject: [PATCH 133/424] Improved code [skip ci] --- pgvector/utils/sparsevec.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 0c6885c..6860740 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -2,15 +2,6 @@ from struct import pack, unpack_from -def to_db_value(value): - if isinstance(value, SparseVector): - return value - elif isinstance(value, (list, np.ndarray)): - return SparseVector.from_dense(value) - else: - raise ValueError('expected sparsevec') - - class SparseVector: def __init__(self, dim, indices, values): self.dim = dim @@ -35,7 +26,7 @@ def to_db(value, dim=None): if value is None: return value - value = to_db_value(value) + value = __class__.to_db_value(value) if dim is not None and value.dim != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim)) @@ -46,10 +37,18 @@ def to_db_binary(value): if value is None: return value - value = to_db_value(value) + value = __class__.to_db_value(value) nnz = len(value.indices) return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) + def to_db_value(value): + if isinstance(value, SparseVector): + return value + elif isinstance(value, (list, np.ndarray)): + return SparseVector.from_dense(value) + else: + raise ValueError('expected sparsevec') + def from_db(value): if value is None or isinstance(value, SparseVector): return value From 1aec1c0e17a0dbe0617436b45626dee84a84a4d7 Mon Sep 17 
00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 15:07:31 -0400 Subject: [PATCH 134/424] Improved code [skip ci] --- pgvector/utils/bit.py | 12 ++++++------ pgvector/utils/halfvec.py | 6 +++--- pgvector/utils/sparsevec.py | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index ef639ba..88dfe4a 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -13,6 +13,12 @@ def __init__(self, value): else: self.value = np.array(value, dtype=bool) + def __str__(self): + return self.__class__.to_db(self) + + def __repr__(self): + return f'Bit({self})' + def to_db(value): if not isinstance(value, Bit): raise ValueError('expected bit') @@ -26,9 +32,3 @@ def to_db_binary(value): value = value.value return pack('>i', len(value)) + np.packbits(value).tobytes() - - def __str__(self): - return self.__class__.to_db(self) - - def __repr__(self): - return f'Bit({self})' diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index b977291..734bb25 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -15,6 +15,9 @@ def __init__(self, value): def to_list(self): return list(self.value) + def __repr__(self): + return f'HalfVector({self.value})' + def to_db(value, dim=None): if value is None: return value @@ -43,6 +46,3 @@ def from_db_binary(value): return value dim, unused = unpack_from('>HH', value) return HalfVector(unpack_from(f'>{dim}e', value, 4)) - - def __repr__(self): - return f'HalfVector({self.value})' diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 6860740..79ac80c 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -22,6 +22,9 @@ def to_dense(self): vec[i] = v return vec + def __repr__(self): + return f'SparseVector({self.dim}, {self.indices}, {self.values})' + def to_db(value, dim=None): if value is None: return value @@ -68,6 +71,3 @@ def from_db_binary(value): indices = unpack_from(f'>{nnz}i', value, 12) 
values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) return SparseVector(int(dim), indices, values) - - def __repr__(self): - return f'SparseVector({self.dim}, {self.indices}, {self.values})' From 739afcd7c790edf62d67f4391a0919c678e73ac1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 15:07:56 -0400 Subject: [PATCH 135/424] Revert "Improved code [skip ci]" This reverts commit e0c84e4c0f8def287aa45327bf809fc2b8760abe. --- pgvector/utils/sparsevec.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 79ac80c..cbd37c7 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -2,6 +2,15 @@ from struct import pack, unpack_from +def to_db_value(value): + if isinstance(value, SparseVector): + return value + elif isinstance(value, (list, np.ndarray)): + return SparseVector.from_dense(value) + else: + raise ValueError('expected sparsevec') + + class SparseVector: def __init__(self, dim, indices, values): self.dim = dim @@ -29,7 +38,7 @@ def to_db(value, dim=None): if value is None: return value - value = __class__.to_db_value(value) + value = to_db_value(value) if dim is not None and value.dim != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim)) @@ -40,18 +49,10 @@ def to_db_binary(value): if value is None: return value - value = __class__.to_db_value(value) + value = to_db_value(value) nnz = len(value.indices) return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) - def to_db_value(value): - if isinstance(value, SparseVector): - return value - elif isinstance(value, (list, np.ndarray)): - return SparseVector.from_dense(value) - else: - raise ValueError('expected sparsevec') - def from_db(value): if value is None or isinstance(value, SparseVector): return value From c13d9bc5d52a401917290c21d7188003655a0fd6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 31 May 2024 19:23:32 
-0700 Subject: [PATCH 136/424] Improved code [skip ci] --- pgvector/psycopg2/halfvec.py | 2 +- pgvector/psycopg2/sparsevec.py | 2 +- pgvector/psycopg2/vector.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index e5ec111..b480911 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -2,7 +2,7 @@ from ..utils import HalfVector -class HalfvecAdapter(object): +class HalfvecAdapter: def __init__(self, value): self._value = value diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index 4bcdd92..31e86f3 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -2,7 +2,7 @@ from ..utils import SparseVector -class SparsevecAdapter(object): +class SparsevecAdapter: def __init__(self, value): self._value = value diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index c0b066a..4fdb077 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -3,7 +3,7 @@ from ..utils import Vector -class VectorAdapter(object): +class VectorAdapter: def __init__(self, value): self._value = value From ba45eb23dd2eda0e8e15c4b5874c2cba5a577f81 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 14:55:50 -0700 Subject: [PATCH 137/424] Switched to uppercase for SQLAlchemy data types --- pgvector/sqlalchemy/__init__.py | 16 +++++++++------- pgvector/sqlalchemy/bit.py | 4 ++-- pgvector/sqlalchemy/halfvec.py | 4 ++-- pgvector/sqlalchemy/sparsevec.py | 4 ++-- pgvector/sqlalchemy/vector.py | 12 ++++++------ tests/test_sqlalchemy.py | 20 ++++++++++---------- tests/test_sqlmodel.py | 10 +++++----- 7 files changed, 36 insertions(+), 34 deletions(-) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 355a2d7..67b1d16 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,14 +1,16 @@ -from .bit import Bit -from .halfvec import 
Halfvec -from .sparsevec import Sparsevec -from .vector import Vector +from .bit import BIT +from .halfvec import HALFVEC +from .sparsevec import SPARSEVEC +from .vector import VECTOR +from .vector import VECTOR as Vector from ..utils import HalfVector, SparseVector __all__ = [ 'Vector', - 'Halfvec', - 'Bit', - 'Sparsevec', + 'VECTOR' + 'HALFVEC', + 'BIT', + 'SPARSEVEC', 'HalfVector', 'SparseVector' ] diff --git a/pgvector/sqlalchemy/bit.py b/pgvector/sqlalchemy/bit.py index f71c1d0..0f83f3c 100644 --- a/pgvector/sqlalchemy/bit.py +++ b/pgvector/sqlalchemy/bit.py @@ -2,7 +2,7 @@ from sqlalchemy.types import UserDefinedType, Float -class Bit(UserDefinedType): +class BIT(UserDefinedType): cache_ok = True def __init__(self, length=None): @@ -23,4 +23,4 @@ def jaccard_distance(self, other): # for reflection -ischema_names['bit'] = Bit +ischema_names['bit'] = BIT diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py index cafc6c6..1c5f8f1 100644 --- a/pgvector/sqlalchemy/halfvec.py +++ b/pgvector/sqlalchemy/halfvec.py @@ -3,7 +3,7 @@ from ..utils import HalfVector -class Halfvec(UserDefinedType): +class HALFVEC(UserDefinedType): cache_ok = True _string = String() @@ -48,4 +48,4 @@ def l1_distance(self, other): # for reflection -ischema_names['halfvec'] = Halfvec +ischema_names['halfvec'] = HALFVEC diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py index 499dff0..ca5e9aa 100644 --- a/pgvector/sqlalchemy/sparsevec.py +++ b/pgvector/sqlalchemy/sparsevec.py @@ -3,7 +3,7 @@ from ..utils import SparseVector -class Sparsevec(UserDefinedType): +class SPARSEVEC(UserDefinedType): cache_ok = True _string = String() @@ -48,4 +48,4 @@ def l1_distance(self, other): # for reflection -ischema_names['sparsevec'] = Sparsevec +ischema_names['sparsevec'] = SPARSEVEC diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py index 056dcef..b2ce7f7 100644 --- a/pgvector/sqlalchemy/vector.py +++ 
b/pgvector/sqlalchemy/vector.py @@ -1,9 +1,9 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import Vector as Vec +from ..utils import Vector -class Vector(UserDefinedType): +class VECTOR(UserDefinedType): cache_ok = True _string = String() @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return Vec.to_db(value, self.dim) + return Vector.to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(Vec.to_db(value, self.dim)) + return string_literal_processor(Vector.to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return Vec.from_db(value) + return Vector.from_db(value) return process class comparator_factory(UserDefinedType.Comparator): @@ -48,4 +48,4 @@ def l1_distance(self, other): # for reflection -ischema_names['vector'] = Vector +ischema_names['vector'] = VECTOR diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 1fff283..52b3e36 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -19,10 +19,10 @@ class Item(Base): __tablename__ = 'sqlalchemy_orm_item' id = mapped_column(Integer, primary_key=True) - embedding = mapped_column(Vector(3)) - half_embedding = mapped_column(Halfvec(3)) - binary_embedding = mapped_column(Bit(3)) - sparse_embedding = mapped_column(Sparsevec(3)) + embedding = mapped_column(VECTOR(3)) + half_embedding = 
mapped_column(HALFVEC(3)) + binary_embedding = mapped_column(BIT(3)) + sparse_embedding = mapped_column(SPARSEVEC(3)) Base.metadata.drop_all(engine) @@ -59,10 +59,10 @@ def test_core(self): 'sqlalchemy_core_item', metadata, Column('id', Integer, primary_key=True), - Column('embedding', Vector(3)), - Column('half_embedding', Halfvec(3)), - Column('binary_embedding', Bit(3)), - Column('sparse_embedding', Sparsevec(3)) + Column('embedding', VECTOR(3)), + Column('half_embedding', HALFVEC(3)), + Column('binary_embedding', BIT(3)), + Column('sparse_embedding', SPARSEVEC(3)) ) metadata.drop_all(engine) @@ -388,7 +388,7 @@ def test_bad_dtype(self): def test_inspect(self): columns = inspect(engine).get_columns('sqlalchemy_orm_item') - assert isinstance(columns[1]['type'], Vector) + assert isinstance(columns[1]['type'], VECTOR) def test_literal_binds(self): sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 8563bec..1b6baaf 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import Vector, Halfvec, Bit, Sparsevec, SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError @@ -16,10 +16,10 @@ class Item(SQLModel, table=True): __tablename__ = 'sqlmodel_item' id: Optional[int] = Field(default=None, primary_key=True) - embedding: Optional[Any] = Field(default=None, sa_column=Column(Vector(3))) - half_embedding: Optional[Any] = Field(default=None, sa_column=Column(Halfvec(3))) - binary_embedding: Optional[Any] = Field(default=None, sa_column=Column(Bit(3))) - sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(Sparsevec(3))) + embedding: Optional[Any] = Field(default=None, sa_column=Column(VECTOR(3))) + half_embedding: 
Optional[Any] = Field(default=None, sa_column=Column(HALFVEC(3))) + binary_embedding: Optional[Any] = Field(default=None, sa_column=Column(BIT(3))) + sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(SPARSEVEC(3))) SQLModel.metadata.drop_all(engine) From 4010076f7a7dc1641210da10acff2c17e73137b5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 15:57:45 -0700 Subject: [PATCH 138/424] Updated readme [skip ci] --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 710525a..cf39d7e 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ class Item(models.Model): embedding = VectorField(dimensions=3) ``` +Also supports `HalfVectorField` (unreleased), `BitField` (unreleased), and `SparseVectorField` (unreleased) + Insert a vector ```python @@ -76,7 +78,7 @@ from pgvector.django import L2Distance Item.objects.order_by(L2Distance('embedding', [3, 1, 2]))[:5] ``` -Also supports `MaxInnerProduct` and `CosineDistance` +Also supports `MaxInnerProduct`, `CosineDistance`, `L1Distance` (unreleased), `HammingDistance` (unreleased), and `JaccardDistance` (unreleased) Get the distance @@ -144,6 +146,8 @@ class Item(Base): embedding = mapped_column(Vector(3)) ``` +Also supports `HALFVEC` (unreleased), `BIT` (unreleased), and `SPARSEVEC` (unreleased) + Insert a vector ```python @@ -158,7 +162,7 @@ Get the nearest neighbors to a vector session.scalars(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product` and `cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) Get the distance @@ -224,6 +228,8 @@ class Item(SQLModel, table=True): embedding: Any = Field(sa_column=Column(Vector(3))) ``` +Also supports `HALFVEC` (unreleased), `BIT` (unreleased), and `SPARSEVEC` (unreleased) + Insert a vector ```python @@ -238,7 
+244,7 @@ Get the nearest neighbors to a vector session.exec(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product` and `cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) Get the distance @@ -453,6 +459,8 @@ class Item(BaseModel): embedding = VectorField(dimensions=3) ``` +Also supports `HalfVectorField` (unreleased), `FixedBitField` (unreleased), and `SparseVectorField` (unreleased) + Insert a vector ```python @@ -465,7 +473,7 @@ Get the nearest neighbors to a vector Item.select().order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5) ``` -Also supports `max_inner_product` and `cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) Get the distance From c1a7312a3509932b2c39e730a7906fd44c86015c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:04:29 -0700 Subject: [PATCH 139/424] Improved bit code [skip ci] --- pgvector/utils/bit.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 88dfe4a..4c0b449 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -7,28 +7,30 @@ def __init__(self, value): if isinstance(value, bytes): count = unpack_from('>i', value)[0] buf = np.frombuffer(value[4:], dtype=np.uint8) - self.value = np.unpackbits(buf, count=count).astype(bool) + self._value = np.unpackbits(buf, count=count).astype(bool) elif isinstance(value, str): - self.value = np.array([v != '0' for v in value], dtype=bool) + self._value = np.array([v != '0' for v in value], dtype=bool) else: - self.value = np.array(value, dtype=bool) + self._value = np.array(value, dtype=bool) def __str__(self): - return self.__class__.to_db(self) + return 
''.join(self._value.astype(np.uint8).astype(str)) def __repr__(self): return f'Bit({self})' + def to_binary(self): + value = self._value + return pack('>i', len(value)) + np.packbits(value).tobytes() + def to_db(value): if not isinstance(value, Bit): raise ValueError('expected bit') - value = value.value - return ''.join(value.astype(np.uint8).astype(str)) + return str(value) def to_db_binary(value): if not isinstance(value, Bit): raise ValueError('expected bit') - value = value.value - return pack('>i', len(value)) + np.packbits(value).tobytes() + return value.to_binary() From e93e54ee10759ff608d33c68cd10bb5dd6d9319b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:06:21 -0700 Subject: [PATCH 140/424] Improved bit code [skip ci] --- pgvector/utils/bit.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 4c0b449..878273d 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -14,11 +14,14 @@ def __init__(self, value): self._value = np.array(value, dtype=bool) def __str__(self): - return ''.join(self._value.astype(np.uint8).astype(str)) + return self.to_text() def __repr__(self): return f'Bit({self})' + def to_text(self): + return ''.join(self._value.astype(np.uint8).astype(str)) + def to_binary(self): value = self._value return pack('>i', len(value)) + np.packbits(value).tobytes() @@ -27,7 +30,7 @@ def to_db(value): if not isinstance(value, Bit): raise ValueError('expected bit') - return str(value) + return value.to_text() def to_db_binary(value): if not isinstance(value, Bit): From 1efe8154772c9b154a21164f48a343ecf197a1b8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:10:24 -0700 Subject: [PATCH 141/424] Improved type code [skip ci] --- pgvector/utils/halfvec.py | 10 +++++----- pgvector/utils/sparsevec.py | 22 +++++++++++----------- pgvector/utils/vector.py | 6 +++--- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git 
a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 734bb25..7f04088 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -10,19 +10,19 @@ def __init__(self, value): if not isinstance(value, (list, tuple)): raise ValueError('expected list or tuple') - self.value = value + self._value = value def to_list(self): - return list(self.value) + return list(self._value) def __repr__(self): - return f'HalfVector({self.value})' + return f'HalfVector({self._value})' def to_db(value, dim=None): if value is None: return value if isinstance(value, HalfVector): - value = value.value + value = value._value if dim is not None and len(value) != dim: raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) @@ -33,7 +33,7 @@ def to_db_binary(value): if value is None: return value if isinstance(value, HalfVector): - value = value.value + value = value._value return pack(f'>HH{len(value)}e', len(value), 0, *value) def from_db(value): diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index cbd37c7..1e09327 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -13,9 +13,9 @@ def to_db_value(value): class SparseVector: def __init__(self, dim, indices, values): - self.dim = dim - self.indices = indices - self.values = values + self._dim = dim + self._indices = indices + self._values = values def from_dense(value): if isinstance(value, np.ndarray): @@ -26,13 +26,13 @@ def from_dense(value): return SparseVector(dim, indices, values) def to_dense(self): - vec = [0] * self.dim - for i, v in zip(self.indices, self.values): + vec = [0] * self._dim + for i, v in zip(self._indices, self._values): vec[i] = v return vec def __repr__(self): - return f'SparseVector({self.dim}, {self.indices}, {self.values})' + return f'SparseVector({self._dim}, {self._indices}, {self._values})' def to_db(value, dim=None): if value is None: @@ -40,18 +40,18 @@ def to_db(value, dim=None): value = to_db_value(value) - if dim is not 
None and value.dim != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, value.dim)) + if dim is not None and value._dim != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value._dim)) - return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value.indices, value.values)]) + '}/' + str(value.dim) + return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value._indices, value._values)]) + '}/' + str(value._dim) def to_db_binary(value): if value is None: return value value = to_db_value(value) - nnz = len(value.indices) - return pack(f'>iii{nnz}i{nnz}f', value.dim, nnz, 0, *value.indices, *value.values) + nnz = len(value._indices) + return pack(f'>iii{nnz}i{nnz}f', value._dim, nnz, 0, *value._indices, *value._values) def from_db(value): if value is None or isinstance(value, SparseVector): diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 3f44e0d..dd67700 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -10,7 +10,7 @@ def __init__(self, value): if not isinstance(value, (list, tuple)): raise ValueError('expected list or tuple') - self.value = value + self._value = value def from_db(value): # could be ndarray if already cast by lower-level driver @@ -39,7 +39,7 @@ def to_db(value, dim=None): value = value.tolist() elif isinstance(value, Vector): - value = value.value + value = value._value if dim is not None and len(value) != dim: raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) @@ -51,7 +51,7 @@ def to_db_binary(value): return value if isinstance(value, Vector): - value = value.value + value = value._value value = np.asarray(value, dtype='>f') From a4bf4aed85d75046eb7e28a0a4feff231c46a96b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:13:52 -0700 Subject: [PATCH 142/424] Improved HalfVector code [skip ci] --- pgvector/utils/halfvec.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git 
a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 7f04088..9f617ec 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -12,29 +12,39 @@ def __init__(self, value): self._value = value - def to_list(self): - return list(self._value) - def __repr__(self): return f'HalfVector({self._value})' + def to_text(self): + return '[' + ','.join([str(float(v)) for v in self._value]) + ']' + + def to_binary(self): + return pack(f'>HH{len(self._value)}e', len(self._value), 0, *self._value) + + def dim(self): + return len(self._value) + + def to_list(self): + return list(self._value) + def to_db(value, dim=None): if value is None: return value - if isinstance(value, HalfVector): - value = value._value + if not isinstance(value, HalfVector): + value = HalfVector(value) - if dim is not None and len(value) != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) + if dim is not None and value.dim() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) - return '[' + ','.join([str(float(v)) for v in value]) + ']' + return value.to_text() def to_db_binary(value): if value is None: return value - if isinstance(value, HalfVector): - value = value._value - return pack(f'>HH{len(value)}e', len(value), 0, *value) + if not isinstance(value, HalfVector): + value = HalfVector(value) + + return value.to_binary() def from_db(value): if value is None or isinstance(value, HalfVector): From 00ad5852619151edc2d616d996151c2c0e91b807 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:16:13 -0700 Subject: [PATCH 143/424] Improved HalfVector code [skip ci] --- pgvector/utils/halfvec.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 9f617ec..e596fd6 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -27,6 +27,13 @@ def dim(self): def to_list(self): return list(self._value) + def 
from_text(value): + return HalfVector([float(v) for v in value[1:-1].split(',')]) + + def from_binary(value): + dim, unused = unpack_from('>HH', value) + return HalfVector(unpack_from(f'>{dim}e', value, 4)) + def to_db(value, dim=None): if value is None: return value @@ -49,10 +56,9 @@ def to_db_binary(value): def from_db(value): if value is None or isinstance(value, HalfVector): return value - return HalfVector([float(v) for v in value[1:-1].split(',')]) + return __class__.from_text(value) def from_db_binary(value): if value is None or isinstance(value, HalfVector): return value - dim, unused = unpack_from('>HH', value) - return HalfVector(unpack_from(f'>{dim}e', value, 4)) + return __class__.from_binary(value) From 63ac583a8bac60548eaacaad0f48c45ae5152207 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:21:11 -0700 Subject: [PATCH 144/424] Improved SparseVector code [skip ci] --- pgvector/utils/bit.py | 2 ++ pgvector/utils/halfvec.py | 2 ++ pgvector/utils/sparsevec.py | 51 ++++++++++++++++++++++++------------- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 878273d..7fc75bf 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -26,6 +26,8 @@ def to_binary(self): value = self._value return pack('>i', len(value)) + np.packbits(value).tobytes() + # TODO move rest + def to_db(value): if not isinstance(value, Bit): raise ValueError('expected bit') diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index e596fd6..11ae7d1 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -34,6 +34,8 @@ def from_binary(value): dim, unused = unpack_from('>HH', value) return HalfVector(unpack_from(f'>{dim}e', value, 4)) + # TODO move rest + def to_db(value, dim=None): if value is None: return value diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 1e09327..1b42fef 100644 --- a/pgvector/utils/sparsevec.py +++ 
b/pgvector/utils/sparsevec.py @@ -17,6 +17,9 @@ def __init__(self, dim, indices, values): self._indices = indices self._values = values + def __repr__(self): + return f'SparseVector({self._dim}, {self._indices}, {self._values})' + def from_dense(value): if isinstance(value, np.ndarray): value = value.tolist() @@ -31,8 +34,30 @@ def to_dense(self): vec[i] = v return vec - def __repr__(self): - return f'SparseVector({self._dim}, {self._indices}, {self._values})' + def to_text(self): + return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(self._indices, self._values)]) + '}/' + str(self._dim) + + def to_binary(self): + nnz = len(self._indices) + return pack(f'>iii{nnz}i{nnz}f', self._dim, nnz, 0, *self._indices, *self._values) + + def from_text(value): + elements, dim = value.split('/') + indices = [] + values = [] + for e in elements[1:-1].split(','): + i, v = e.split(':') + indices.append(int(i) - 1) + values.append(float(v)) + return SparseVector(int(dim), indices, values) + + def from_binary(value): + dim, nnz, unused = unpack_from('>iii', value) + indices = unpack_from(f'>{nnz}i', value, 12) + values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) + return SparseVector(int(dim), indices, values) + + # TODO move rest def to_db(value, dim=None): if value is None: @@ -43,32 +68,24 @@ def to_db(value, dim=None): if dim is not None and value._dim != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value._dim)) - return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(value._indices, value._values)]) + '}/' + str(value._dim) + return value.to_text() def to_db_binary(value): if value is None: return value value = to_db_value(value) - nnz = len(value._indices) - return pack(f'>iii{nnz}i{nnz}f', value._dim, nnz, 0, *value._indices, *value._values) + + return value.to_binary() def from_db(value): if value is None or isinstance(value, SparseVector): return value - elements, dim = value.split('/') - indices = [] - values = [] - for e in elements[1:-1].split(','): 
- i, v = e.split(':') - indices.append(int(i) - 1) - values.append(float(v)) - return SparseVector(int(dim), indices, values) + + return __class__.from_text(value) def from_db_binary(value): if value is None or isinstance(value, SparseVector): return value - dim, nnz, unused = unpack_from('>iii', value) - indices = unpack_from(f'>{nnz}i', value, 12) - values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - return SparseVector(int(dim), indices, values) + + return __class__.from_binary(value) From 585045976942fbfcabfc420f8c10bb906cc68b58 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 16:26:13 -0700 Subject: [PATCH 145/424] Improved code [skip ci] --- pgvector/utils/halfvec.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 11ae7d1..4a14aeb 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -39,6 +39,7 @@ def from_binary(value): def to_db(value, dim=None): if value is None: return value + if not isinstance(value, HalfVector): value = HalfVector(value) @@ -50,6 +51,7 @@ def to_db(value, dim=None): def to_db_binary(value): if value is None: return value + if not isinstance(value, HalfVector): value = HalfVector(value) @@ -58,9 +60,11 @@ def to_db_binary(value): def from_db(value): if value is None or isinstance(value, HalfVector): return value + return __class__.from_text(value) def from_db_binary(value): if value is None or isinstance(value, HalfVector): return value + return __class__.from_binary(value) From 3b4643ba1ada45f9ad26759eff37d944c790af22 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 17:12:46 -0700 Subject: [PATCH 146/424] Improved HalfVector code [skip ci] --- pgvector/utils/halfvec.py | 30 ++++++++++++++++-------------- tests/test_psycopg.py | 2 ++ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 4a14aeb..590929e 100644 --- a/pgvector/utils/halfvec.py +++ 
b/pgvector/utils/halfvec.py @@ -4,35 +4,37 @@ class HalfVector: def __init__(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() + value = np.asarray(value, dtype=np.float16) - if not isinstance(value, (list, tuple)): - raise ValueError('expected list or tuple') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') self._value = value def __repr__(self): - return f'HalfVector({self._value})' - - def to_text(self): - return '[' + ','.join([str(float(v)) for v in self._value]) + ']' - - def to_binary(self): - return pack(f'>HH{len(self._value)}e', len(self._value), 0, *self._value) + return f'HalfVector({self.to_list()})' def dim(self): - return len(self._value) + return self._value.shape[0] def to_list(self): - return list(self._value) + return self._value.tolist() + + def to_numpy(self): + return self._value + + def to_text(self): + return '[' + ','.join([str(v) for v in self._value]) + ']' + + def to_binary(self): + return pack('>HH', self.dim(), 0) + np.array(self._value, dtype='>f2').tobytes() def from_text(value): return HalfVector([float(v) for v in value[1:-1].split(',')]) def from_binary(value): dim, unused = unpack_from('>HH', value) - return HalfVector(unpack_from(f'>{dim}e', value, 4)) + return HalfVector(np.frombuffer(value, dtype='>f2', count=dim, offset=4).astype(dtype=np.float16)) # TODO move rest diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 956a25d..a692ce0 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -73,11 +73,13 @@ def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res.to_list() == [1.5, 2, 3] + assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res.to_list() == [1.5, 2, 3] + assert 
np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_bit(self): embedding = Bit([True, False, True]) From 7af59e7e73932ecf0ee20fec335fee9e175591cb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 17:17:55 -0700 Subject: [PATCH 147/424] Reduced copying for HalfVector --- pgvector/utils/halfvec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 590929e..52436b8 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -4,7 +4,7 @@ class HalfVector: def __init__(self, value): - value = np.asarray(value, dtype=np.float16) + value = np.asarray(value, dtype='>f2') if value.ndim != 1: raise ValueError('expected ndim to be 1') @@ -27,14 +27,14 @@ def to_text(self): return '[' + ','.join([str(v) for v in self._value]) + ']' def to_binary(self): - return pack('>HH', self.dim(), 0) + np.array(self._value, dtype='>f2').tobytes() + return pack('>HH', self.dim(), 0) + self._value.tobytes() def from_text(value): return HalfVector([float(v) for v in value[1:-1].split(',')]) def from_binary(value): dim, unused = unpack_from('>HH', value) - return HalfVector(np.frombuffer(value, dtype='>f2', count=dim, offset=4).astype(dtype=np.float16)) + return HalfVector(np.frombuffer(value, dtype='>f2', count=dim, offset=4)) # TODO move rest From 6f8bcfe6d5f442c786124c856ae9168896093f62 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 17:32:57 -0700 Subject: [PATCH 148/424] Reduced copying for HalfVector [skip ci] --- pgvector/utils/halfvec.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 52436b8..57ca4df 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -4,7 +4,9 @@ class HalfVector: def __init__(self, value): - value = np.asarray(value, dtype='>f2') + # asarray still copies if same dtype + if not isinstance(value, np.ndarray) or value.dtype != 
'>f2': + value = np.asarray(value, dtype='>f2') if value.ndim != 1: raise ValueError('expected ndim to be 1') From d1a77b5a29852d460f693caf0c3b045155977df7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 17:39:13 -0700 Subject: [PATCH 149/424] Added tests for HalfVector class --- tests/test_half_vector.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/test_half_vector.py diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py new file mode 100644 index 0000000..d059458 --- /dev/null +++ b/tests/test_half_vector.py @@ -0,0 +1,26 @@ +import numpy as np +from pgvector.utils import HalfVector +import pytest + + +class TestHalfVector: + def test_list(self): + assert HalfVector([1, 2, 3]).to_list() == [1, 2, 3] + + def test_list_str(self): + with pytest.raises(ValueError) as error: + HalfVector([1, 'two', 3]) + assert str(error.value) == "could not convert string to float: 'two'" + + def test_tuple(self): + assert HalfVector((1, 2, 3)).to_list() == [1, 2, 3] + + def test_ndarray(self): + arr = np.array([1, 2, 3]) + assert HalfVector(arr).to_list() == [1, 2, 3] + assert HalfVector(arr).to_numpy() is not arr + + def test_ndarray_same_object(self): + arr = np.array([1, 2, 3], dtype='>f2') + assert HalfVector(arr).to_list() == [1, 2, 3] + assert HalfVector(arr).to_numpy() is arr From 85ea3573184fad34bfd2e16df2abccf6a16ece7f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 17:43:27 -0700 Subject: [PATCH 150/424] Added tests for ndim [skip ci] --- tests/test_half_vector.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index d059458..9114c2c 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -24,3 +24,13 @@ def test_ndarray_same_object(self): arr = np.array([1, 2, 3], dtype='>f2') assert HalfVector(arr).to_list() == [1, 2, 3] assert HalfVector(arr).to_numpy() is arr + + def test_ndim_two(self): + with 
pytest.raises(ValueError) as error: + HalfVector([[1, 2], [3, 4]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + HalfVector(1) + assert str(error.value) == 'expected ndim to be 1' From f75cf2feadb23db05ff52a937bf9d4a1f8114be5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 18:03:17 -0700 Subject: [PATCH 151/424] Improved Vector logic [skip ci] --- pgvector/utils/vector.py | 77 +++++++++++++++++++++++---------------- tests/test_half_vector.py | 3 +- tests/test_sqlalchemy.py | 2 +- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index dd67700..adecda3 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -4,58 +4,71 @@ class Vector: def __init__(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() + # asarray still copies if same dtype + if not isinstance(value, np.ndarray) or value.dtype != '>f4': + value = np.asarray(value, dtype='>f4') - if not isinstance(value, (list, tuple)): - raise ValueError('expected list or tuple') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') self._value = value - def from_db(value): - # could be ndarray if already cast by lower-level driver - if value is None or isinstance(value, np.ndarray): - return value + def __repr__(self): + return f'Vector({self.to_list()})' - return np.array(value[1:-1].split(','), dtype=np.float32) + def dim(self): + return self._value.shape[0] - def from_db_binary(value): - if value is None or isinstance(value, np.ndarray): - return value + def to_list(self): + return self._value.tolist() + + def to_numpy(self): + return self._value + + def to_text(self): + return '[' + ','.join([str(v) for v in self._value]) + ']' + def to_binary(self): + return pack('>HH', self.dim(), 0) + self._value.tobytes() + + def from_text(value): + return Vector([float(v) for v in value[1:-1].split(',')]) + + def 
from_binary(value): dim, unused = unpack_from('>HH', value) - return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) + return Vector(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) + + # TODO move rest def to_db(value, dim=None): if value is None: return value - if isinstance(value, np.ndarray): - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + if not isinstance(value, Vector): + value = Vector(value) - if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): - raise ValueError('dtype must be numeric') + if dim is not None and value.dim() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) - value = value.tolist() - elif isinstance(value, Vector): - value = value._value - - if dim is not None and len(value) != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) - - return '[' + ','.join([str(float(v)) for v in value]) + ']' + return value.to_text() def to_db_binary(value): if value is None: return value - if isinstance(value, Vector): - value = value._value + if not isinstance(value, Vector): + value = Vector(value) - value = np.asarray(value, dtype='>f') + return value.to_binary() - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + def from_db(value): + if value is None or isinstance(value, np.ndarray): + return value + + return __class__.from_text(value).to_numpy().astype(np.float32) + + def from_db_binary(value): + if value is None or isinstance(value, np.ndarray): + return value - return pack('>HH', value.shape[0], 0) + value.tobytes() + return __class__.from_binary(value).to_numpy().astype(np.float32) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 9114c2c..3a3f3ca 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -8,9 +8,8 @@ def test_list(self): assert HalfVector([1, 2, 3]).to_list() == [1, 2, 3] def test_list_str(self): - with 
pytest.raises(ValueError) as error: + with pytest.raises(ValueError, match='could not convert string to float') as error: HalfVector([1, 'two', 3]) - assert str(error.value) == "could not convert string to float: 'two'" def test_tuple(self): assert HalfVector((1, 2, 3)).to_list() == [1, 2, 3] diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 52b3e36..92bc89a 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -383,7 +383,7 @@ def test_bad_dtype(self): item = Item(embedding=np.array(['one', 'two', 'three'])) session = Session(engine) session.add(item) - with pytest.raises(StatementError, match='dtype must be numeric'): + with pytest.raises(StatementError, match='could not convert string to float'): session.commit() def test_inspect(self): From 7aef0f1ccc683fddf1cd5e4367782632b1f93eb4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 18:59:20 -0700 Subject: [PATCH 152/424] Renamed to_dense to to_list and added to_numpy to SparseVector --- pgvector/utils/sparsevec.py | 10 ++++++++-- tests/test_asyncpg.py | 2 +- tests/test_django.py | 2 +- tests/test_peewee.py | 2 +- tests/test_psycopg.py | 8 +++++--- tests/test_psycopg2.py | 2 +- tests/test_sqlalchemy.py | 2 +- tests/test_sqlmodel.py | 2 +- 8 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 1b42fef..611bb16 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -28,8 +28,14 @@ def from_dense(value): values = [value[i] for i in indices] return SparseVector(dim, indices, values) - def to_dense(self): - vec = [0] * self._dim + def to_list(self): + vec = [0.0] * self._dim + for i, v in zip(self._indices, self._values): + vec[i] = v + return vec + + def to_numpy(self): + vec = np.repeat(0.0, self._dim).astype(np.float32) for i, v in zip(self._indices, self._values): vec[i] = v return vec diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 4f0a4bd..3bfc888 
100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -86,7 +86,7 @@ async def test_sparsevec(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['embedding'].to_dense() == [1.5, 2, 3] + assert res[0]['embedding'].to_list() == [1.5, 2, 3] assert res[1]['embedding'] is None # ensures binary format is correct diff --git a/tests/test_django.py b/tests/test_django.py index b40a47a..1c611d9 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -192,7 +192,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item(id=1, sparse_embedding=SparseVector.from_dense([1, 2, 3])).save() item = Item.objects.get(pk=1) - assert item.sparse_embedding.to_dense() == [1, 2, 3] + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() diff --git a/tests/test_peewee.py b/tests/test_peewee.py index deb571f..1455303 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -128,7 +128,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item.create(id=1, sparse_embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert item.sparse_embedding.to_dense() == [1, 2, 3] + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index a692ce0..17483f7 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -106,17 +106,19 @@ def test_sparsevec(self): conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] - assert res.to_dense() == [1.5, 2, 3] + assert res.to_list() == [1.5, 2, 3] def test_sparsevec_binary_format(self): embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), 
binary=True).fetchone()[0] - assert res.to_dense() == [1.5, 0, 2, 0, 3, 0] + assert res.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] - assert res.to_dense() == [1.5, 0, 2, 0, 3, 0] + assert res.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_text_copy(self): embedding = np.array([1.5, 2, 3]) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index cc7e5c0..f18405f 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -51,5 +51,5 @@ def test_sparsevec(self): cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][0].to_dense() == [1.5, 2, 3] + assert res[0][0].to_list() == [1.5, 2, 3] assert res[1][0] is None diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 92bc89a..687a0d0 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -255,7 +255,7 @@ def test_sparsevec(self): session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_dense() == [1, 2, 3] + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 1b6baaf..90f7e21 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -158,7 +158,7 @@ def test_sparsevec(self): session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_dense() == [1, 2, 3] + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() From 323b0ab3ee21c09134e2477a982f8b732fe6c94b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: 
Sat, 1 Jun 2024 19:01:50 -0700 Subject: [PATCH 153/424] Added test for __repr__ [skip ci] --- tests/test_half_vector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 3a3f3ca..c60433d 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -33,3 +33,6 @@ def test_ndim_zero(self): with pytest.raises(ValueError) as error: HalfVector(1) assert str(error.value) == 'expected ndim to be 1' + + def test_repr(self): + assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' From 241b087e797e84a3a585fb5b6261a6e706602a87 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:03:09 -0700 Subject: [PATCH 154/424] Added tests for Vector class [skip ci] --- tests/test_vector.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/test_vector.py diff --git a/tests/test_vector.py b/tests/test_vector.py new file mode 100644 index 0000000..f424019 --- /dev/null +++ b/tests/test_vector.py @@ -0,0 +1,38 @@ +import numpy as np +from pgvector.utils import Vector +import pytest + + +class TestVector: + def test_list(self): + assert Vector([1, 2, 3]).to_list() == [1, 2, 3] + + def test_list_str(self): + with pytest.raises(ValueError, match='could not convert string to float') as error: + Vector([1, 'two', 3]) + + def test_tuple(self): + assert Vector((1, 2, 3)).to_list() == [1, 2, 3] + + def test_ndarray(self): + arr = np.array([1, 2, 3]) + assert Vector(arr).to_list() == [1, 2, 3] + assert Vector(arr).to_numpy() is not arr + + def test_ndarray_same_object(self): + arr = np.array([1, 2, 3], dtype='>f4') + assert Vector(arr).to_list() == [1, 2, 3] + assert Vector(arr).to_numpy() is arr + + def test_ndim_two(self): + with pytest.raises(ValueError) as error: + Vector([[1, 2], [3, 4]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + Vector(1) + assert 
str(error.value) == 'expected ndim to be 1' + + def test_repr(self): + assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' From e3708976aad6bc2e7e980b139d2e64df6582a1f8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:05:19 -0700 Subject: [PATCH 155/424] Added tests for Bit class [skip ci] --- tests/test_bit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/test_bit.py diff --git a/tests/test_bit.py b/tests/test_bit.py new file mode 100644 index 0000000..35dbc82 --- /dev/null +++ b/tests/test_bit.py @@ -0,0 +1,11 @@ +import numpy as np +from pgvector.utils import Bit +import pytest + + +class TestBit: + def test_list(self): + assert str(Bit([True, False, True])) == '101' + + def test_str(self): + assert str(Bit('101')) == '101' From a64323d6e6919caa8b509415b1acc555fd94732f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:08:33 -0700 Subject: [PATCH 156/424] Added test for SparseVector class [skip ci] --- tests/test_sparse_vector.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/test_sparse_vector.py diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py new file mode 100644 index 0000000..488908c --- /dev/null +++ b/tests/test_sparse_vector.py @@ -0,0 +1,9 @@ +import numpy as np +from pgvector.utils import SparseVector +import pytest + + +class TestSparseVector: + def test_from_dense(self): + assert SparseVector.from_dense([1, 2, 3]).to_list() == [1, 2, 3] + assert SparseVector.from_dense([1, 2, 3]).to_numpy().tolist() == [1, 2, 3] From 140e444742de020982cba3ef908388c9a8b29680 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:12:14 -0700 Subject: [PATCH 157/424] Improved Bit code [skip ci] --- pgvector/utils/bit.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 7fc75bf..1ecbb4c 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -5,13 
+5,11 @@ class Bit: def __init__(self, value): if isinstance(value, bytes): - count = unpack_from('>i', value)[0] - buf = np.frombuffer(value[4:], dtype=np.uint8) - self._value = np.unpackbits(buf, count=count).astype(bool) + self._value = __class__.from_binary(value)._value elif isinstance(value, str): - self._value = np.array([v != '0' for v in value], dtype=bool) + self._value = __class__.from_text(value)._value else: - self._value = np.array(value, dtype=bool) + self._value = np.asarray(value, dtype=bool) def __str__(self): return self.to_text() @@ -26,6 +24,14 @@ def to_binary(self): value = self._value return pack('>i', len(value)) + np.packbits(value).tobytes() + def from_text(value): + return Bit(np.asarray([v != '0' for v in value], dtype=bool)) + + def from_binary(value): + count = unpack_from('>i', value)[0] + buf = np.frombuffer(value[4:], dtype=np.uint8) + return Bit(np.unpackbits(buf, count=count).astype(bool)) + # TODO move rest def to_db(value): From 343ee3e4e5584d83419f16ffed1fbd18a8a69fbc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:19:15 -0700 Subject: [PATCH 158/424] Added more methods to Bit class [skip ci] --- pgvector/utils/bit.py | 9 +++++++++ tests/test_bit.py | 10 +++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 1ecbb4c..ce7dd2d 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -17,6 +17,15 @@ def __str__(self): def __repr__(self): return f'Bit({self})' + def dim(self): + return self._value.shape[0] + + def to_list(self): + return self._value.tolist() + + def to_numpy(self): + return self._value + def to_text(self): return ''.join(self._value.astype(np.uint8).astype(str)) diff --git a/tests/test_bit.py b/tests/test_bit.py index 35dbc82..8de6a2e 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -5,7 +5,15 @@ class TestBit: def test_list(self): - assert str(Bit([True, False, True])) == '101' + assert Bit([True, False, 
True]).to_list() == [True, False, True] + + def test_tuple(self): + assert Bit((True, False, True)).to_list() == [True, False, True] def test_str(self): assert str(Bit('101')) == '101' + + def test_ndarray_same_object(self): + arr = np.array([True, False, True], dtype=bool) + assert Bit(arr).to_list() == [True, False, True] + assert Bit(arr).to_numpy() is arr From 22c81a4d833bd33e84ad71fd679a4954b0efba71 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:19:34 -0700 Subject: [PATCH 159/424] Simplified test [skip ci] --- tests/test_bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bit.py b/tests/test_bit.py index 8de6a2e..1f744c6 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -14,6 +14,6 @@ def test_str(self): assert str(Bit('101')) == '101' def test_ndarray_same_object(self): - arr = np.array([True, False, True], dtype=bool) + arr = np.array([True, False, True]) assert Bit(arr).to_list() == [True, False, True] assert Bit(arr).to_numpy() is arr From 8fac4979aac664da6f93fd690a2290c16e9b8869 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:20:47 -0700 Subject: [PATCH 160/424] Added test for __repr__ [skip ci] --- tests/test_bit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_bit.py b/tests/test_bit.py index 1f744c6..e2006b9 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -17,3 +17,6 @@ def test_ndarray_same_object(self): arr = np.array([True, False, True]) assert Bit(arr).to_list() == [True, False, True] assert Bit(arr).to_numpy() is arr + + def test_repr(self): + assert repr(Bit([True, False, True])) == 'Bit(101)' From 7e17e9e3091abb168b89289acc36dff5531a6e50 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:22:11 -0700 Subject: [PATCH 161/424] Added test for __repr__ [skip ci] --- tests/test_sparse_vector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 488908c..e4af365 
100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -7,3 +7,6 @@ class TestSparseVector: def test_from_dense(self): assert SparseVector.from_dense([1, 2, 3]).to_list() == [1, 2, 3] assert SparseVector.from_dense([1, 2, 3]).to_numpy().tolist() == [1, 2, 3] + + def test_repr(self): + assert repr(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1, 2, 3])' From db122fe5dc532a63322b2d10da7037525e459b34 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:31:53 -0700 Subject: [PATCH 162/424] Added check for ndim for Bit [skip ci] --- pgvector/utils/bit.py | 7 ++++++- tests/test_bit.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index ce7dd2d..d92c327 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -9,7 +9,12 @@ def __init__(self, value): elif isinstance(value, str): self._value = __class__.from_text(value)._value else: - self._value = np.asarray(value, dtype=bool) + value = np.asarray(value, dtype=bool) + + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + self._value = value def __str__(self): return self.to_text() diff --git a/tests/test_bit.py b/tests/test_bit.py index e2006b9..001ecae 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -18,5 +18,15 @@ def test_ndarray_same_object(self): assert Bit(arr).to_list() == [True, False, True] assert Bit(arr).to_numpy() is arr + def test_ndim_two(self): + with pytest.raises(ValueError) as error: + Bit([[True, False], [True, False]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + Bit(True) + assert str(error.value) == 'expected ndim to be 1' + def test_repr(self): assert repr(Bit([True, False, True])) == 'Bit(101)' From 33ec0f2ff007736e2fd808a8c7d1b84f290d6072 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:39:25 -0700 Subject: [PATCH 163/424] 
Removed from_binary path from Bit constructor [skip ci] --- pgvector/utils/bit.py | 4 +--- tests/test_psycopg.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index d92c327..da7a532 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -4,9 +4,7 @@ class Bit: def __init__(self, value): - if isinstance(value, bytes): - self._value = __class__.from_binary(value)._value - elif isinstance(value, str): + if isinstance(value, str): self._value = __class__.from_text(value)._value else: value = np.asarray(value, dtype=bool) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 17483f7..8da32f6 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -91,8 +91,8 @@ def test_bit(self): def test_bit_binary_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) res = conn.execute('SELECT %b::bit(9)', (embedding,), binary=True).fetchone()[0] - assert str(Bit(res)) == '010100001' - assert repr(Bit(res)) == 'Bit(010100001)' + assert str(Bit.from_binary(res)) == '010100001' + assert repr(Bit.from_binary(res)) == 'Bit(010100001)' def test_bit_text_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) From a3dc9166cbc1653f6bb7928c04ba5a0e30c8ea1b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:44:52 -0700 Subject: [PATCH 164/424] Use consistent __str__ representation [skip ci] --- pgvector/utils/bit.py | 5 +---- tests/test_bit.py | 3 ++- tests/test_half_vector.py | 1 + tests/test_psycopg.py | 2 -- tests/test_sparse_vector.py | 1 + tests/test_vector.py | 1 + 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index da7a532..9deea1f 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -14,11 +14,8 @@ def __init__(self, value): self._value = value - def __str__(self): - return self.to_text() - def __repr__(self): - return 
f'Bit({self})' + return f'Bit({self.to_text()})' def dim(self): return self._value.shape[0] diff --git a/tests/test_bit.py b/tests/test_bit.py index 001ecae..c42bd05 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -11,7 +11,7 @@ def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] def test_str(self): - assert str(Bit('101')) == '101' + assert Bit('101').to_list() == [True, False, True] def test_ndarray_same_object(self): arr = np.array([True, False, True]) @@ -30,3 +30,4 @@ def test_ndim_zero(self): def test_repr(self): assert repr(Bit([True, False, True])) == 'Bit(101)' + assert str(Bit([True, False, True])) == 'Bit(101)' diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index c60433d..683c6d5 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -36,3 +36,4 @@ def test_ndim_zero(self): def test_repr(self): assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 8da32f6..2de1ec7 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -91,14 +91,12 @@ def test_bit(self): def test_bit_binary_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) res = conn.execute('SELECT %b::bit(9)', (embedding,), binary=True).fetchone()[0] - assert str(Bit.from_binary(res)) == '010100001' assert repr(Bit.from_binary(res)) == 'Bit(010100001)' def test_bit_text_format(self): embedding = Bit([False, True, False, True, False, False, False, False, True]) res = conn.execute('SELECT %t::bit(9)', (embedding,)).fetchone()[0] assert res == '010100001' - assert str(Bit(res)) == '010100001' assert repr(Bit(res)) == 'Bit(010100001)' def test_sparsevec(self): diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index e4af365..49dbf6f 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py 
@@ -10,3 +10,4 @@ def test_from_dense(self): def test_repr(self): assert repr(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1, 2, 3])' + assert str(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1, 2, 3])' diff --git a/tests/test_vector.py b/tests/test_vector.py index f424019..8e1bd67 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -36,3 +36,4 @@ def test_ndim_zero(self): def test_repr(self): assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' From f59434f938d44f4aa1cae8c965d695bc1e75e346 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:50:02 -0700 Subject: [PATCH 165/424] Added dim method to SparseVector [skip ci] --- pgvector/utils/sparsevec.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 611bb16..bc9a981 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -28,6 +28,9 @@ def from_dense(value): values = [value[i] for i in indices] return SparseVector(dim, indices, values) + def dim(self): + return self._dim + def to_list(self): vec = [0.0] * self._dim for i, v in zip(self._indices, self._values): @@ -71,8 +74,8 @@ def to_db(value, dim=None): value = to_db_value(value) - if dim is not None and value._dim != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, value._dim)) + if dim is not None and value.dim() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) return value.to_text() From 9f5f4eb6cc200c0047bb348068d5b01bcb16c6f4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 19:59:22 -0700 Subject: [PATCH 166/424] Use class name [skip ci] --- pgvector/utils/halfvec.py | 4 ++-- pgvector/utils/sparsevec.py | 4 ++-- pgvector/utils/vector.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgvector/utils/halfvec.py 
b/pgvector/utils/halfvec.py index 57ca4df..d408816 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -65,10 +65,10 @@ def from_db(value): if value is None or isinstance(value, HalfVector): return value - return __class__.from_text(value) + return HalfVector.from_text(value) def from_db_binary(value): if value is None or isinstance(value, HalfVector): return value - return __class__.from_binary(value) + return HalfVector.from_binary(value) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index bc9a981..ff39b97 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -91,10 +91,10 @@ def from_db(value): if value is None or isinstance(value, SparseVector): return value - return __class__.from_text(value) + return SparseVector.from_text(value) def from_db_binary(value): if value is None or isinstance(value, SparseVector): return value - return __class__.from_binary(value) + return SparseVector.from_binary(value) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index adecda3..2294f4f 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -65,10 +65,10 @@ def from_db(value): if value is None or isinstance(value, np.ndarray): return value - return __class__.from_text(value).to_numpy().astype(np.float32) + return Vector.from_text(value).to_numpy().astype(np.float32) def from_db_binary(value): if value is None or isinstance(value, np.ndarray): return value - return __class__.from_binary(value).to_numpy().astype(np.float32) + return Vector.from_binary(value).to_numpy().astype(np.float32) From b1d2cdb17c8abf25925b5ee0625ef9e77c5d25b2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 20:02:37 -0700 Subject: [PATCH 167/424] Improved SparseVector code [skip ci] --- pgvector/utils/sparsevec.py | 9 +++++---- tests/test_sparse_vector.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 
ff39b97..e435009 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -13,9 +13,10 @@ def to_db_value(value): class SparseVector: def __init__(self, dim, indices, values): - self._dim = dim - self._indices = indices - self._values = values + # TODO improve + self._dim = int(dim) + self._indices = [int(i) for i in indices] + self._values = [float(v) for v in values] def __repr__(self): return f'SparseVector({self._dim}, {self._indices}, {self._values})' @@ -25,7 +26,7 @@ def from_dense(value): value = value.tolist() dim = len(value) indices = [i for i, v in enumerate(value) if v != 0] - values = [value[i] for i in indices] + values = [float(value[i]) for i in indices] return SparseVector(dim, indices, values) def dim(self): diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 49dbf6f..04fcf88 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -9,5 +9,5 @@ def test_from_dense(self): assert SparseVector.from_dense([1, 2, 3]).to_numpy().tolist() == [1, 2, 3] def test_repr(self): - assert repr(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1, 2, 3])' - assert str(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1, 2, 3])' + assert repr(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' + assert str(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' From 6c98554e13060d33168a453d42d6055f3cec740a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 20:13:31 -0700 Subject: [PATCH 168/424] Added casting [skip ci] --- pgvector/utils/halfvec.py | 2 +- pgvector/utils/sparsevec.py | 2 +- pgvector/utils/vector.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index d408816..16626f6 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -26,7 +26,7 @@ def to_numpy(self): return self._value def 
to_text(self): - return '[' + ','.join([str(v) for v in self._value]) + ']' + return '[' + ','.join([str(float(v)) for v in self._value]) + ']' def to_binary(self): return pack('>HH', self.dim(), 0) + self._value.tobytes() diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index e435009..4a8ebd4 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -45,7 +45,7 @@ def to_numpy(self): return vec def to_text(self): - return '{' + ','.join([f'{i + 1}:{v}' for i, v in zip(self._indices, self._values)]) + '}/' + str(self._dim) + return '{' + ','.join([f'{int(i) + 1}:{float(v)}' for i, v in zip(self._indices, self._values)]) + '}/' + str(int(self._dim)) def to_binary(self): nnz = len(self._indices) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 2294f4f..6264c93 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -26,7 +26,7 @@ def to_numpy(self): return self._value def to_text(self): - return '[' + ','.join([str(v) for v in self._value]) + ']' + return '[' + ','.join([str(float(v)) for v in self._value]) + ']' def to_binary(self): return pack('>HH', self.dim(), 0) + self._value.tobytes() From 1d82171235e9d55905f0962e4997380306914c60 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 20:17:56 -0700 Subject: [PATCH 169/424] Removed dim method from Bit [skip ci] --- pgvector/utils/bit.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 9deea1f..c99ddbf 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -17,9 +17,6 @@ def __init__(self, value): def __repr__(self): return f'Bit({self.to_text()})' - def dim(self): - return self._value.shape[0] - def to_list(self): return self._value.tolist() From 87249e92ad2d244da7272dc70b59af874139e4a0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 20:20:27 -0700 Subject: [PATCH 170/424] Added tests for dim methods [skip ci] --- pgvector/utils/bit.py | 3 
+-- pgvector/utils/halfvec.py | 2 +- pgvector/utils/vector.py | 2 +- tests/test_half_vector.py | 3 +++ tests/test_sparse_vector.py | 3 +++ tests/test_vector.py | 3 +++ 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index c99ddbf..26c12ca 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -27,8 +27,7 @@ def to_text(self): return ''.join(self._value.astype(np.uint8).astype(str)) def to_binary(self): - value = self._value - return pack('>i', len(value)) + np.packbits(value).tobytes() + return pack('>i', len(self._value)) + np.packbits(self._value).tobytes() def from_text(value): return Bit(np.asarray([v != '0' for v in value], dtype=bool)) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 16626f6..596f04c 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -17,7 +17,7 @@ def __repr__(self): return f'HalfVector({self.to_list()})' def dim(self): - return self._value.shape[0] + return len(self._value) def to_list(self): return self._value.tolist() diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 6264c93..c21f109 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -17,7 +17,7 @@ def __repr__(self): return f'Vector({self.to_list()})' def dim(self): - return self._value.shape[0] + return len(self._value) def to_list(self): return self._value.tolist() diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 683c6d5..b2811b2 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -37,3 +37,6 @@ def test_ndim_zero(self): def test_repr(self): assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + + def test_dim(self): + assert HalfVector([1, 2, 3]).dim() == 3 diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 04fcf88..34884fb 100644 --- a/tests/test_sparse_vector.py +++ 
b/tests/test_sparse_vector.py @@ -11,3 +11,6 @@ def test_from_dense(self): def test_repr(self): assert repr(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' assert str(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' + + def test_dim(self): + assert SparseVector.from_dense([1, 2, 3]).dim() == 3 diff --git a/tests/test_vector.py b/tests/test_vector.py index 8e1bd67..d6ecb5b 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -37,3 +37,6 @@ def test_ndim_zero(self): def test_repr(self): assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + + def test_dim(self): + assert Vector([1, 2, 3]).dim() == 3 From 4bf9d32dcaa4d37d57d45441968342434c6a3f97 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 1 Jun 2024 20:51:15 -0700 Subject: [PATCH 171/424] Improved code [skip ci] --- pgvector/utils/bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 26c12ca..78533ef 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -34,7 +34,7 @@ def from_text(value): def from_binary(value): count = unpack_from('>i', value)[0] - buf = np.frombuffer(value[4:], dtype=np.uint8) + buf = np.frombuffer(value, dtype=np.uint8, offset=4) return Bit(np.unpackbits(buf, count=count).astype(bool)) # TODO move rest From 240e4135bfe4a63bdb32c2c137bfe0ca49165c37 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 3 Jun 2024 18:40:39 -0700 Subject: [PATCH 172/424] Added underscores to private functions [skip ci] --- pgvector/asyncpg/register.py | 12 ++++++------ pgvector/django/functions.py | 6 +++--- pgvector/django/halfvec.py | 6 +++--- pgvector/django/sparsevec.py | 6 +++--- pgvector/django/vector.py | 6 +++--- pgvector/peewee/halfvec.py | 4 ++-- pgvector/peewee/sparsevec.py | 4 ++-- pgvector/peewee/vector.py | 4 ++-- pgvector/psycopg/bit.py | 4 ++-- 
pgvector/psycopg/halfvec.py | 8 ++++---- pgvector/psycopg/sparsevec.py | 8 ++++---- pgvector/psycopg/vector.py | 8 ++++---- pgvector/psycopg2/halfvec.py | 4 ++-- pgvector/psycopg2/sparsevec.py | 4 ++-- pgvector/psycopg2/vector.py | 4 ++-- pgvector/sqlalchemy/halfvec.py | 6 +++--- pgvector/sqlalchemy/sparsevec.py | 6 +++--- pgvector/sqlalchemy/vector.py | 6 +++--- pgvector/utils/bit.py | 6 ++---- pgvector/utils/halfvec.py | 10 ++++------ pgvector/utils/sparsevec.py | 31 ++++++++++++++----------------- pgvector/utils/vector.py | 10 ++++------ 22 files changed, 77 insertions(+), 86 deletions(-) diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py index 4664218..7148ffa 100644 --- a/pgvector/asyncpg/register.py +++ b/pgvector/asyncpg/register.py @@ -4,21 +4,21 @@ async def register_vector(conn): await conn.set_type_codec( 'vector', - encoder=Vector.to_db_binary, - decoder=Vector.from_db_binary, + encoder=Vector._to_db_binary, + decoder=Vector._from_db_binary, format='binary' ) await conn.set_type_codec( 'halfvec', - encoder=HalfVector.to_db_binary, - decoder=HalfVector.from_db_binary, + encoder=HalfVector._to_db_binary, + decoder=HalfVector._from_db_binary, format='binary' ) await conn.set_type_codec( 'sparsevec', - encoder=SparseVector.to_db_binary, - decoder=SparseVector.from_db_binary, + encoder=SparseVector._to_db_binary, + decoder=SparseVector._from_db_binary, format='binary' ) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 9b11869..da9fbf8 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -8,11 +8,11 @@ class DistanceBase(Func): def __init__(self, expression, vector, **extra): if not hasattr(vector, 'resolve_expression'): if isinstance(vector, HalfVector): - vector = Value(HalfVector.to_db(vector)) + vector = Value(HalfVector._to_db(vector)) elif isinstance(vector, SparseVector): - vector = Value(SparseVector.to_db(vector)) + vector = Value(SparseVector._to_db(vector)) else: - 
vector = Value(Vector.to_db(vector)) + vector = Value(Vector._to_db(vector)) super().__init__(expression, vector, **extra) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 1c3bae5..a7921b7 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -23,13 +23,13 @@ def db_type(self, connection): return 'halfvec(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return HalfVector.from_db(value) + return HalfVector._from_db(value) def to_python(self, value): - return HalfVector.from_db(value) + return HalfVector._from_db(value) def get_prep_value(self, value): - return HalfVector.to_db(value) + return HalfVector._to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 3a06574..4ec734f 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -23,13 +23,13 @@ def db_type(self, connection): return 'sparsevec(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return SparseVector.from_db(value) + return SparseVector._from_db(value) def to_python(self, value): - return SparseVector.from_db(value) + return SparseVector._from_db(value) def get_prep_value(self, value): - return SparseVector.to_db(value) + return SparseVector._to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py index 30fd99f..a89d540 100644 --- a/pgvector/django/vector.py +++ b/pgvector/django/vector.py @@ -25,15 +25,15 @@ def db_type(self, connection): return 'vector(%d)' % self.dimensions def from_db_value(self, value, expression, connection): - return Vector.from_db(value) + return Vector._from_db(value) def to_python(self, value): if isinstance(value, list): return np.array(value, dtype=np.float32) - return Vector.from_db(value) + return 
Vector._from_db(value) def get_prep_value(self, value): - return Vector.to_db(value) + return Vector._to_db(value) def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index e30dcd0..bed7d1f 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return HalfVector.to_db(value) + return HalfVector._to_db(value) def python_value(self, value): - return HalfVector.from_db(value) + return HalfVector._from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index b6d3b91..8bba5cf 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return SparseVector.to_db(value) + return SparseVector._to_db(value) def python_value(self, value): - return SparseVector.from_db(value) + return SparseVector._from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py index 3c11e74..a9ebf87 100644 --- a/pgvector/peewee/vector.py +++ b/pgvector/peewee/vector.py @@ -13,10 +13,10 @@ def get_modifiers(self): return self.dimensions and [self.dimensions] or None def db_value(self, value): - return Vector.to_db(value) + return Vector._to_db(value) def python_value(self, value): - return Vector.from_db(value) + return Vector._from_db(value) def _distance(self, op, vector): return Expression(lhs=self, op=op, rhs=self.to_value(vector)) diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py index 80cfac0..605c6f0 100644 --- a/pgvector/psycopg/bit.py +++ 
b/pgvector/psycopg/bit.py @@ -8,7 +8,7 @@ class BitDumper(Dumper): format = Format.TEXT def dump(self, obj): - return Bit.to_db(obj).encode('utf8') + return Bit._to_db(obj).encode('utf8') class BitBinaryDumper(BitDumper): @@ -16,7 +16,7 @@ class BitBinaryDumper(BitDumper): format = Format.BINARY def dump(self, obj): - return Bit.to_db_binary(obj) + return Bit._to_db_binary(obj) def register_bit_info(context, info): diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py index 6ca232d..351d2cb 100644 --- a/pgvector/psycopg/halfvec.py +++ b/pgvector/psycopg/halfvec.py @@ -8,7 +8,7 @@ class HalfVectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return HalfVector.to_db(obj).encode('utf8') + return HalfVector._to_db(obj).encode('utf8') class HalfVectorBinaryDumper(HalfVectorDumper): @@ -16,7 +16,7 @@ class HalfVectorBinaryDumper(HalfVectorDumper): format = Format.BINARY def dump(self, obj): - return HalfVector.to_db_binary(obj) + return HalfVector._to_db_binary(obj) class HalfVectorLoader(Loader): @@ -26,7 +26,7 @@ class HalfVectorLoader(Loader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return HalfVector.from_db(data.decode('utf8')) + return HalfVector._from_db(data.decode('utf8')) class HalfVectorBinaryLoader(HalfVectorLoader): @@ -36,7 +36,7 @@ class HalfVectorBinaryLoader(HalfVectorLoader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return HalfVector.from_db_binary(data) + return HalfVector._from_db_binary(data) def register_halfvec_info(context, info): diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py index 634d83b..435fd06 100644 --- a/pgvector/psycopg/sparsevec.py +++ b/pgvector/psycopg/sparsevec.py @@ -8,7 +8,7 @@ class SparseVectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return SparseVector.to_db(obj).encode('utf8') + return SparseVector._to_db(obj).encode('utf8') class SparseVectorBinaryDumper(SparseVectorDumper): @@ 
-16,7 +16,7 @@ class SparseVectorBinaryDumper(SparseVectorDumper): format = Format.BINARY def dump(self, obj): - return SparseVector.to_db_binary(obj) + return SparseVector._to_db_binary(obj) class SparseVectorLoader(Loader): @@ -26,7 +26,7 @@ class SparseVectorLoader(Loader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return SparseVector.from_db(data.decode('utf8')) + return SparseVector._from_db(data.decode('utf8')) class SparseVectorBinaryLoader(SparseVectorLoader): @@ -36,7 +36,7 @@ class SparseVectorBinaryLoader(SparseVectorLoader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return SparseVector.from_db_binary(data) + return SparseVector._from_db_binary(data) def register_sparsevec_info(context, info): diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index 4875fe2..55e31a6 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -8,7 +8,7 @@ class VectorDumper(Dumper): format = Format.TEXT def dump(self, obj): - return Vector.to_db(obj).encode('utf8') + return Vector._to_db(obj).encode('utf8') class VectorBinaryDumper(VectorDumper): @@ -16,7 +16,7 @@ class VectorBinaryDumper(VectorDumper): format = Format.BINARY def dump(self, obj): - return Vector.to_db_binary(obj) + return Vector._to_db_binary(obj) class VectorLoader(Loader): @@ -26,7 +26,7 @@ class VectorLoader(Loader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return Vector.from_db(data.decode('utf8')) + return Vector._from_db(data.decode('utf8')) class VectorBinaryLoader(VectorLoader): @@ -36,7 +36,7 @@ class VectorBinaryLoader(VectorLoader): def load(self, data): if isinstance(data, memoryview): data = bytes(data) - return Vector.from_db_binary(data) + return Vector._from_db_binary(data) def register_vector_info(context, info): diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index b480911..e3c0cdf 100644 --- a/pgvector/psycopg2/halfvec.py +++ 
b/pgvector/psycopg2/halfvec.py @@ -7,11 +7,11 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(HalfVector.to_db(self._value)).getquoted() + return adapt(HalfVector._to_db(self._value)).getquoted() def cast_halfvec(value, cur): - return HalfVector.from_db(value) + return HalfVector._from_db(value) def register_halfvec_info(oid): diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index 31e86f3..7cdf38c 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -7,11 +7,11 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(SparseVector.to_db(self._value)).getquoted() + return adapt(SparseVector._to_db(self._value)).getquoted() def cast_sparsevec(value, cur): - return SparseVector.from_db(value) + return SparseVector._from_db(value) def register_sparsevec_info(oid): diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 4fdb077..c895f86 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -8,11 +8,11 @@ def __init__(self, value): self._value = value def getquoted(self): - return adapt(Vector.to_db(self._value)).getquoted() + return adapt(Vector._to_db(self._value)).getquoted() def cast_vector(value, cur): - return Vector.from_db(value) + return Vector._from_db(value) def register_vector_info(oid): diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py index 1c5f8f1..639f77b 100644 --- a/pgvector/sqlalchemy/halfvec.py +++ b/pgvector/sqlalchemy/halfvec.py @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return HalfVector.to_db(value, self.dim) + return HalfVector._to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(HalfVector.to_db(value, self.dim)) + return 
string_literal_processor(HalfVector._to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return HalfVector.from_db(value) + return HalfVector._from_db(value) return process class comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py index ca5e9aa..370f5d1 100644 --- a/pgvector/sqlalchemy/sparsevec.py +++ b/pgvector/sqlalchemy/sparsevec.py @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return SparseVector.to_db(value, self.dim) + return SparseVector._to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(SparseVector.to_db(value, self.dim)) + return string_literal_processor(SparseVector._to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return SparseVector.from_db(value) + return SparseVector._from_db(value) return process class comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py index b2ce7f7..f57a045 100644 --- a/pgvector/sqlalchemy/vector.py +++ b/pgvector/sqlalchemy/vector.py @@ -18,19 +18,19 @@ def get_col_spec(self, **kw): def bind_processor(self, dialect): def process(value): - return Vector.to_db(value, self.dim) + return Vector._to_db(value, self.dim) return process def literal_processor(self, dialect): string_literal_processor = self._string._cached_literal_processor(dialect) def process(value): - return string_literal_processor(Vector.to_db(value, self.dim)) + return string_literal_processor(Vector._to_db(value, self.dim)) return process def result_processor(self, dialect, coltype): def process(value): - return Vector.from_db(value) + return Vector._from_db(value) return process class 
comparator_factory(UserDefinedType.Comparator): diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 78533ef..da342a0 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -37,15 +37,13 @@ def from_binary(value): buf = np.frombuffer(value, dtype=np.uint8, offset=4) return Bit(np.unpackbits(buf, count=count).astype(bool)) - # TODO move rest - - def to_db(value): + def _to_db(value): if not isinstance(value, Bit): raise ValueError('expected bit') return value.to_text() - def to_db_binary(value): + def _to_db_binary(value): if not isinstance(value, Bit): raise ValueError('expected bit') diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index 596f04c..e2836c8 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -38,9 +38,7 @@ def from_binary(value): dim, unused = unpack_from('>HH', value) return HalfVector(np.frombuffer(value, dtype='>f2', count=dim, offset=4)) - # TODO move rest - - def to_db(value, dim=None): + def _to_db(value, dim=None): if value is None: return value @@ -52,7 +50,7 @@ def to_db(value, dim=None): return value.to_text() - def to_db_binary(value): + def _to_db_binary(value): if value is None: return value @@ -61,13 +59,13 @@ def to_db_binary(value): return value.to_binary() - def from_db(value): + def _from_db(value): if value is None or isinstance(value, HalfVector): return value return HalfVector.from_text(value) - def from_db_binary(value): + def _from_db_binary(value): if value is None or isinstance(value, HalfVector): return value diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 4a8ebd4..ace268a 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -2,15 +2,6 @@ from struct import pack, unpack_from -def to_db_value(value): - if isinstance(value, SparseVector): - return value - elif isinstance(value, (list, np.ndarray)): - return SparseVector.from_dense(value) - else: - raise ValueError('expected sparsevec') - - class SparseVector: 
def __init__(self, dim, indices, values): # TODO improve @@ -67,34 +58,40 @@ def from_binary(value): values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) return SparseVector(int(dim), indices, values) - # TODO move rest - - def to_db(value, dim=None): + def _to_db(value, dim=None): if value is None: return value - value = to_db_value(value) + value = __class__._to_db_value(value) if dim is not None and value.dim() != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) return value.to_text() - def to_db_binary(value): + def _to_db_binary(value): if value is None: return value - value = to_db_value(value) + value = __class__._to_db_value(value) return value.to_binary() - def from_db(value): + def _to_db_value(value): + if isinstance(value, SparseVector): + return value + elif isinstance(value, (list, np.ndarray)): + return SparseVector.from_dense(value) + else: + raise ValueError('expected sparsevec') + + def _from_db(value): if value is None or isinstance(value, SparseVector): return value return SparseVector.from_text(value) - def from_db_binary(value): + def _from_db_binary(value): if value is None or isinstance(value, SparseVector): return value diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index c21f109..4c70b34 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -38,9 +38,7 @@ def from_binary(value): dim, unused = unpack_from('>HH', value) return Vector(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) - # TODO move rest - - def to_db(value, dim=None): + def _to_db(value, dim=None): if value is None: return value @@ -52,7 +50,7 @@ def to_db(value, dim=None): return value.to_text() - def to_db_binary(value): + def _to_db_binary(value): if value is None: return value @@ -61,13 +59,13 @@ def to_db_binary(value): return value.to_binary() - def from_db(value): + def _from_db(value): if value is None or isinstance(value, np.ndarray): return value return 
Vector.from_text(value).to_numpy().astype(np.float32) - def from_db_binary(value): + def _from_db_binary(value): if value is None or isinstance(value, np.ndarray): return value From f1ce5f3f11d351c1fca19b9b7ff3ed99f6f1db70 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 3 Jun 2024 18:42:36 -0700 Subject: [PATCH 173/424] Removed unneeded code [skip ci] --- pgvector/utils/sparsevec.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index ace268a..0fde7f1 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -13,8 +13,6 @@ def __repr__(self): return f'SparseVector({self._dim}, {self._indices}, {self._values})' def from_dense(value): - if isinstance(value, np.ndarray): - value = value.tolist() dim = len(value) indices = [i for i, v in enumerate(value) if v != 0] values = [float(value[i]) for i in indices] From b43b58ff7536dd3cdecad5aaa8c5c18a7746827a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 14:17:55 -0700 Subject: [PATCH 174/424] Use classmethod decorator - closes #72 Co-authored-by: "domenico.cinque" --- pgvector/utils/bit.py | 10 ++++++---- pgvector/utils/halfvec.py | 38 +++++++++++++++++++++---------------- pgvector/utils/sparsevec.py | 31 +++++++++++++++++------------- pgvector/utils/vector.py | 24 +++++++++++++---------- 4 files changed, 60 insertions(+), 43 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index da342a0..b5ede20 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -37,14 +37,16 @@ def from_binary(value): buf = np.frombuffer(value, dtype=np.uint8, offset=4) return Bit(np.unpackbits(buf, count=count).astype(bool)) - def _to_db(value): - if not isinstance(value, Bit): + @classmethod + def _to_db(cls, value): + if not isinstance(value, cls): raise ValueError('expected bit') return value.to_text() - def _to_db_binary(value): - if not isinstance(value, Bit): + @classmethod + def _to_db_binary(cls, value): 
+ if not isinstance(value, cls): raise ValueError('expected bit') return value.to_binary() diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index e2836c8..d01b381 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -31,42 +31,48 @@ def to_text(self): def to_binary(self): return pack('>HH', self.dim(), 0) + self._value.tobytes() - def from_text(value): - return HalfVector([float(v) for v in value[1:-1].split(',')]) + @classmethod + def from_text(cls, value): + return cls([float(v) for v in value[1:-1].split(',')]) - def from_binary(value): + @classmethod + def from_binary(cls, value): dim, unused = unpack_from('>HH', value) - return HalfVector(np.frombuffer(value, dtype='>f2', count=dim, offset=4)) + return cls(np.frombuffer(value, dtype='>f2', count=dim, offset=4)) - def _to_db(value, dim=None): + @classmethod + def _to_db(cls, value, dim=None): if value is None: return value - if not isinstance(value, HalfVector): - value = HalfVector(value) + if not isinstance(value, cls): + value = cls(value) if dim is not None and value.dim() != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) return value.to_text() - def _to_db_binary(value): + @classmethod + def _to_db_binary(cls, value): if value is None: return value - if not isinstance(value, HalfVector): - value = HalfVector(value) + if not isinstance(value, cls): + value = cls(value) return value.to_binary() - def _from_db(value): - if value is None or isinstance(value, HalfVector): + @classmethod + def _from_db(cls, value): + if value is None or isinstance(value, cls): return value - return HalfVector.from_text(value) + return cls.from_text(value) - def _from_db_binary(value): - if value is None or isinstance(value, HalfVector): + @classmethod + def _from_db_binary(cls, value): + if value is None or isinstance(value, cls): return value - return HalfVector.from_binary(value) + return cls.from_binary(value) diff --git a/pgvector/utils/sparsevec.py 
b/pgvector/utils/sparsevec.py index 0fde7f1..18e510d 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -56,41 +56,46 @@ def from_binary(value): values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) return SparseVector(int(dim), indices, values) - def _to_db(value, dim=None): + @classmethod + def _to_db(cls, value, dim=None): if value is None: return value - value = __class__._to_db_value(value) + value = cls._to_db_value(value) if dim is not None and value.dim() != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) return value.to_text() - def _to_db_binary(value): + @classmethod + def _to_db_binary(cls, value): if value is None: return value - value = __class__._to_db_value(value) + value = cls._to_db_value(value) return value.to_binary() - def _to_db_value(value): - if isinstance(value, SparseVector): + @classmethod + def _to_db_value(cls, value): + if isinstance(value, cls): return value elif isinstance(value, (list, np.ndarray)): - return SparseVector.from_dense(value) + return cls.from_dense(value) else: raise ValueError('expected sparsevec') - def _from_db(value): - if value is None or isinstance(value, SparseVector): + @classmethod + def _from_db(cls, value): + if value is None or isinstance(value, cls): return value - return SparseVector.from_text(value) + return cls.from_text(value) - def _from_db_binary(value): - if value is None or isinstance(value, SparseVector): + @classmethod + def _from_db_binary(cls, value): + if value is None or isinstance(value, cls): return value - return SparseVector.from_binary(value) + return cls.from_binary(value) diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 4c70b34..1ddba4c 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -38,35 +38,39 @@ def from_binary(value): dim, unused = unpack_from('>HH', value) return Vector(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) - def _to_db(value, dim=None): + @classmethod + def 
_to_db(cls, value, dim=None): if value is None: return value - if not isinstance(value, Vector): - value = Vector(value) + if not isinstance(value, cls): + value = cls(value) if dim is not None and value.dim() != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) return value.to_text() - def _to_db_binary(value): + @classmethod + def _to_db_binary(cls, value): if value is None: return value - if not isinstance(value, Vector): - value = Vector(value) + if not isinstance(value, cls): + value = cls(value) return value.to_binary() - def _from_db(value): + @classmethod + def _from_db(cls, value): if value is None or isinstance(value, np.ndarray): return value - return Vector.from_text(value).to_numpy().astype(np.float32) + return cls.from_text(value).to_numpy().astype(np.float32) - def _from_db_binary(value): + @classmethod + def _from_db_binary(cls, value): if value is None or isinstance(value, np.ndarray): return value - return Vector.from_binary(value).to_numpy().astype(np.float32) + return cls.from_binary(value).to_numpy().astype(np.float32) From 3a4118ca3687a5ff366087924b0aacb523bab66e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 15:26:07 -0700 Subject: [PATCH 175/424] Improved SparseVector tests [skip ci] --- tests/test_sparse_vector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 34884fb..92c6432 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -5,12 +5,12 @@ class TestSparseVector: def test_from_dense(self): - assert SparseVector.from_dense([1, 2, 3]).to_list() == [1, 2, 3] - assert SparseVector.from_dense([1, 2, 3]).to_numpy().tolist() == [1, 2, 3] + assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] def test_repr(self): - assert repr(SparseVector.from_dense([1, 2, 
3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' - assert str(SparseVector.from_dense([1, 2, 3])) == 'SparseVector(3, [0, 1, 2], [1.0, 2.0, 3.0])' + assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' + assert str(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' def test_dim(self): - assert SparseVector.from_dense([1, 2, 3]).dim() == 3 + assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).dim() == 6 From fb8edf9da0fc1d2bab273e283689ec2dc93cdbe5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 15:29:56 -0700 Subject: [PATCH 176/424] Use classmethod decorator for more methods --- pgvector/utils/bit.py | 10 ++++++---- pgvector/utils/sparsevec.py | 10 ++++++---- pgvector/utils/vector.py | 10 ++++++---- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index b5ede20..3b27db3 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -29,13 +29,15 @@ def to_text(self): def to_binary(self): return pack('>i', len(self._value)) + np.packbits(self._value).tobytes() - def from_text(value): - return Bit(np.asarray([v != '0' for v in value], dtype=bool)) + @classmethod + def from_text(cls, value): + return cls(np.asarray([v != '0' for v in value], dtype=bool)) - def from_binary(value): + @classmethod + def from_binary(cls, value): count = unpack_from('>i', value)[0] buf = np.frombuffer(value, dtype=np.uint8, offset=4) - return Bit(np.unpackbits(buf, count=count).astype(bool)) + return cls(np.unpackbits(buf, count=count).astype(bool)) @classmethod def _to_db(cls, value): diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 18e510d..8d3f866 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -40,7 +40,8 @@ def to_binary(self): nnz = len(self._indices) return pack(f'>iii{nnz}i{nnz}f', self._dim, nnz, 0, *self._indices, *self._values) - def 
from_text(value): + @classmethod + def from_text(cls, value): elements, dim = value.split('/') indices = [] values = [] @@ -48,13 +49,14 @@ def from_text(value): i, v = e.split(':') indices.append(int(i) - 1) values.append(float(v)) - return SparseVector(int(dim), indices, values) + return cls(int(dim), indices, values) - def from_binary(value): + @classmethod + def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - return SparseVector(int(dim), indices, values) + return cls(int(dim), indices, values) @classmethod def _to_db(cls, value, dim=None): diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 1ddba4c..7a0f6bc 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -31,12 +31,14 @@ def to_text(self): def to_binary(self): return pack('>HH', self.dim(), 0) + self._value.tobytes() - def from_text(value): - return Vector([float(v) for v in value[1:-1].split(',')]) + @classmethod + def from_text(cls, value): + return cls([float(v) for v in value[1:-1].split(',')]) - def from_binary(value): + @classmethod + def from_binary(cls, value): dim, unused = unpack_from('>HH', value) - return Vector(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) + return cls(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) @classmethod def _to_db(cls, value, dim=None): From 63bc2507e47a0a87cc941044a40529b9b4b677cd Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 15:30:54 -0700 Subject: [PATCH 177/424] Use classmethod decorator for from_dense [skip ci] --- pgvector/utils/sparsevec.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 8d3f866..962938b 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -12,11 +12,12 @@ def __init__(self, dim, indices, values): def __repr__(self): return 
f'SparseVector({self._dim}, {self._indices}, {self._values})' - def from_dense(value): + @classmethod + def from_dense(cls, value): dim = len(value) indices = [i for i, v in enumerate(value) if v != 0] values = [float(value[i]) for i in indices] - return SparseVector(dim, indices, values) + return cls(dim, indices, values) def dim(self): return self._dim From df3427dace3cf9de6fe231229c319b3c1b958e58 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 15:36:51 -0700 Subject: [PATCH 178/424] Added from_coordinates method to SparseVector --- pgvector/utils/sparsevec.py | 10 ++++++++++ tests/test_sparse_vector.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 962938b..9254e54 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -12,6 +12,16 @@ def __init__(self, dim, indices, values): def __repr__(self): return f'SparseVector({self._dim}, {self._indices}, {self._values})' + @classmethod + def from_coordinates(cls, coordinates, dim): + if isinstance(coordinates, dict): + coordinates = coordinates.items() + elements = [(i, v) for i, v in coordinates] + elements.sort() + indices = [int(v[0]) for v in elements] + values = [float(v[1]) for v in elements] + return cls(dim, indices, values) + @classmethod def from_dense(cls, value): dim = len(value) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 92c6432..5fb4b7e 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -8,6 +8,10 @@ def test_from_dense(self): assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + def test_from_coordinates(self): + assert SparseVector.from_coordinates({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_coordinates([(0, 1), (2, 2), (4, 3)], 6).to_list() == [1, 0, 2, 0, 3, 0] + def 
test_repr(self): assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' assert str(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' From 08a73cdf7dc062ed11c0b8b0227a7def850ae144 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 16:13:47 -0700 Subject: [PATCH 179/424] Removed support for array from from_coordinates [skip ci] --- pgvector/utils/sparsevec.py | 4 +--- tests/test_sparse_vector.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 9254e54..afcd856 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -14,9 +14,7 @@ def __repr__(self): @classmethod def from_coordinates(cls, coordinates, dim): - if isinstance(coordinates, dict): - coordinates = coordinates.items() - elements = [(i, v) for i, v in coordinates] + elements = [(i, v) for i, v in coordinates.items()] elements.sort() indices = [int(v[0]) for v in elements] values = [float(v[1]) for v in elements] diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 5fb4b7e..f6f96e0 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -10,7 +10,6 @@ def test_from_dense(self): def test_from_coordinates(self): assert SparseVector.from_coordinates({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_coordinates([(0, 1), (2, 2), (4, 3)], 6).to_list() == [1, 0, 2, 0, 3, 0] def test_repr(self): assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' From 999af56edf4ee95bf54783c242dec8834d9daf32 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 16:15:23 -0700 Subject: [PATCH 180/424] Improved SparseVector test [skip ci] --- tests/test_sparse_vector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 
f6f96e0..f622108 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -7,6 +7,7 @@ class TestSparseVector: def test_from_dense(self): assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_dense(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] def test_from_coordinates(self): assert SparseVector.from_coordinates({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] From c708d8f0eaedd4608109aca79bd97bd08b5c4528 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 17:47:58 -0700 Subject: [PATCH 181/424] Updated function name [skip ci] --- pgvector/utils/sparsevec.py | 4 ++-- tests/test_sparse_vector.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index afcd856..0a3df3b 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -13,8 +13,8 @@ def __repr__(self): return f'SparseVector({self._dim}, {self._indices}, {self._values})' @classmethod - def from_coordinates(cls, coordinates, dim): - elements = [(i, v) for i, v in coordinates.items()] + def from_dict(cls, d, dim): + elements = [(i, v) for i, v in d.items()] elements.sort() indices = [int(v[0]) for v in elements] values = [float(v[1]) for v in elements] diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index f622108..0f69b8a 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -10,7 +10,7 @@ def test_from_dense(self): assert SparseVector.from_dense(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] def test_from_coordinates(self): - assert SparseVector.from_coordinates({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_dict({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] def test_repr(self): assert 
repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' From 1b5dc44cc8cbe5a7885989da6f0cc0a173b2f474 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 18:04:02 -0700 Subject: [PATCH 182/424] Added from_scipy method to SparseVector --- pgvector/utils/sparsevec.py | 12 ++++++++++++ requirements.txt | 1 + tests/test_sparse_vector.py | 6 ++++++ 3 files changed, 19 insertions(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 0a3df3b..b1a53d7 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -20,6 +20,18 @@ def from_dict(cls, d, dim): values = [float(v[1]) for v in elements] return cls(dim, indices, values) + @classmethod + def from_scipy(cls, value): + value = value.tocoo() + + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + dim = value.shape[0] + indices = value.coords[0].tolist() + values = value.data.tolist() + return cls(dim, indices, values) + @classmethod def from_dense(cls, value): dim = len(value) diff --git a/requirements.txt b/requirements.txt index da0ef44..c1e11f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,6 @@ psycopg[binary] psycopg2-binary pytest pytest-asyncio +scipy SQLAlchemy[asyncio]>=2 sqlmodel>=0.0.12 diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 0f69b8a..f79d62a 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,6 +1,7 @@ import numpy as np from pgvector.utils import SparseVector import pytest +from scipy.sparse import coo_array class TestSparseVector: @@ -12,6 +13,11 @@ def test_from_dense(self): def test_from_coordinates(self): assert SparseVector.from_dict({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + def test_from_scipy(self): + arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) + assert SparseVector.from_scipy(arr).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_scipy(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] 
+ def test_repr(self): assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' assert str(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' From 550aa6403daf8aa0becac1b3e652aab92407e844 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 18:05:49 -0700 Subject: [PATCH 183/424] Renamed from_scipy method to from_sparse [skip ci] --- pgvector/utils/sparsevec.py | 2 +- tests/test_sparse_vector.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index b1a53d7..7de4607 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -21,7 +21,7 @@ def from_dict(cls, d, dim): return cls(dim, indices, values) @classmethod - def from_scipy(cls, value): + def from_sparse(cls, value): value = value.tocoo() if value.ndim != 1: diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index f79d62a..f447442 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -13,10 +13,10 @@ def test_from_dense(self): def test_from_coordinates(self): assert SparseVector.from_dict({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] - def test_from_scipy(self): + def test_from_sparse(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) - assert SparseVector.from_scipy(arr).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_scipy(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_sparse(arr).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_sparse(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] def test_repr(self): assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' From d858fcdfaee47c709566be8ff0b00decbb9d4810 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 18:21:40 -0700 Subject: [PATCH 184/424] Fixed from_sparse method for scipy < 1.13 --- 
pgvector/utils/sparsevec.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 7de4607..c4aa4f1 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -24,11 +24,18 @@ def from_dict(cls, d, dim): def from_sparse(cls, value): value = value.tocoo() - if value.ndim != 1: + if value.ndim == 1: + dim = value.shape[0] + elif value.ndim == 2 and value.shape[0] == 1: + dim = value.shape[1] + else: raise ValueError('expected ndim to be 1') - dim = value.shape[0] - indices = value.coords[0].tolist() + if hasattr(value, 'coords'): + # scipy 1.13+ + indices = value.coords[0].tolist() + else: + indices = value.col.tolist() values = value.data.tolist() return cls(dim, indices, values) From b3851e5ac1ac0dcda7221c66d568e889fb22516f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 4 Jun 2024 18:49:32 -0700 Subject: [PATCH 185/424] Improved code [skip ci] --- pgvector/utils/bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 3b27db3..8a6f2ac 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -5,7 +5,7 @@ class Bit: def __init__(self, value): if isinstance(value, str): - self._value = __class__.from_text(value)._value + self._value = self.from_text(value)._value else: value = np.asarray(value, dtype=bool) From 33eb182c8cbb638f4b6266e0518b14d0beed21d8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 13:50:55 -0700 Subject: [PATCH 186/424] Added more tests [skip ci] --- tests/test_sparse_vector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index f447442..c799fa3 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -16,6 +16,8 @@ def test_from_coordinates(self): def test_from_sparse(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) assert 
SparseVector.from_sparse(arr).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_sparse(arr.tocsc()).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector.from_sparse(arr.tocsr()).to_list() == [1, 0, 2, 0, 3, 0] assert SparseVector.from_sparse(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] def test_repr(self): From 8eaad5c3e83d2f2156d1d550a898db07f4c55740 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 13:56:12 -0700 Subject: [PATCH 187/424] Added to_dict method to SparseVector --- pgvector/utils/sparsevec.py | 3 +++ tests/test_sparse_vector.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index c4aa4f1..f33e496 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -49,6 +49,9 @@ def from_dense(cls, value): def dim(self): return self._dim + def to_dict(self): + return {i: v for i, v in zip(self._indices, self._values)} + def to_list(self): vec = [0.0] * self._dim for i, v in zip(self._indices, self._values): diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index c799fa3..c1258dc 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -26,3 +26,6 @@ def test_repr(self): def test_dim(self): assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).dim() == 6 + + def test_to_dict(self): + assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} From d0a50651bfdaf4ede4feed284f34a22932f52bb7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 13:59:10 -0700 Subject: [PATCH 188/424] Fixed CI --- tests/test_sparse_vector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index c1258dc..73986b3 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -16,8 +16,6 @@ def test_from_coordinates(self): def test_from_sparse(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) assert 
SparseVector.from_sparse(arr).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_sparse(arr.tocsc()).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_sparse(arr.tocsr()).to_list() == [1, 0, 2, 0, 3, 0] assert SparseVector.from_sparse(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] def test_repr(self): From f36aa11a32e8c09196b52061466b3acd66ecd188 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 14:08:41 -0700 Subject: [PATCH 189/424] Simplified code [skip ci] --- pgvector/utils/sparsevec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index f33e496..7112736 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -50,7 +50,7 @@ def dim(self): return self._dim def to_dict(self): - return {i: v for i, v in zip(self._indices, self._values)} + return dict(zip(self._indices, self._values)) def to_list(self): vec = [0.0] * self._dim From 8be15b29e90082b01a173503624659437035cde5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 14:54:21 -0700 Subject: [PATCH 190/424] Added to_coo method to SparseVector --- pgvector/utils/sparsevec.py | 6 ++++++ tests/test_sparse_vector.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 7112736..ebfa0a9 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -52,6 +52,12 @@ def dim(self): def to_dict(self): return dict(zip(self._indices, self._values)) + def to_coo(self): + from scipy.sparse import coo_array + + coords = ([0] * len(self._indices), self._indices) + return coo_array((self._values, coords), shape=(1, self._dim)) + def to_list(self): vec = [0.0] * self._dim for i, v in zip(self._indices, self._values): diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 73986b3..c4565fb 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -27,3 +27,6 @@ def 
test_dim(self): def test_to_dict(self): assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} + + def test_to_coo(self): + assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] From 3fa840e8046410fdefcea5e613314058c6a419b1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 11 Jun 2024 10:41:55 -0700 Subject: [PATCH 191/424] Added test for Automap with SQLAlchemy - #74 --- tests/test_sqlalchemy.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 687a0d0..1c0fb80 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError +from sqlalchemy.ext.automap import automap_base from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.orm import declarative_base, mapped_column, Session from sqlalchemy.sql import func @@ -403,6 +404,16 @@ def test_insert_bulk(self): def test_insert_text(self): session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + def test_automap(self): + metadata = MetaData() + metadata.reflect(engine, only=['sqlalchemy_orm_item']) + AutoBase = automap_base(metadata=metadata) + AutoBase.prepare() + AutoItem = AutoBase.classes.sqlalchemy_orm_item + session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) + item = session.query(AutoItem).first() + assert item.embedding.tolist() == [1, 2, 3] + @pytest.mark.asyncio async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') From 0c8e1f0608f407e006a7514e95db60ebbcc8cd97 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 09:43:17 -0700 Subject: [PATCH 192/424] Improved Django tests --- 
tests/test_django.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 1c611d9..4fe4000 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -93,12 +93,18 @@ def create_items(): Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2])).save() -class ItemForm(ModelForm): +class VectorForm(ModelForm): class Meta: model = Item fields = ['embedding'] +class BitForm(ModelForm): + class Meta: + model = Item + fields = ['binary_embedding'] + + class TestDjango: def setup_method(self, test_method): Item.objects.all().delete() @@ -270,36 +276,41 @@ def test_serialization(self): for obj in serializers.deserialize(format, data): obj.save() - def test_form(self): - form = ItemForm(data={'embedding': '[1, 2, 3]'}) + def test_vector_form(self): + form = VectorForm(data={'embedding': '[1, 2, 3]'}) assert form.is_valid() assert 'value="[1, 2, 3]"' in form.as_div() - def test_form_instance(self): + def test_vector_form_instance(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - form = ItemForm(instance=item) + form = VectorForm(instance=item) assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() - def test_form_save(self): + def test_vector_form_save(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - form = ItemForm(instance=item, data={'embedding': '[4, 5, 6]'}) + form = VectorForm(instance=item, data={'embedding': '[4, 5, 6]'}) assert form.has_changed() assert form.is_valid() assert form.save() assert [4, 5, 6] == Item.objects.get(pk=1).embedding.tolist() - def test_form_save_missing(self): + def test_vector_form_save_missing(self): Item(id=1).save() item = Item.objects.get(pk=1) - form = ItemForm(instance=item, data={'embedding': ''}) + form = VectorForm(instance=item, data={'embedding': ''}) assert form.is_valid() assert form.save() assert 
Item.objects.get(pk=1).embedding is None + def test_bit_form(self): + form = BitForm(data={'binary_embedding': '101'}) + assert form.is_valid() + assert 'value="101"' in form.as_div() + def test_clean(self): - item = Item(id=1, embedding=[1, 2, 3]) + item = Item(id=1, embedding=[1, 2, 3], binary_embedding='101') item.full_clean() def test_get_or_create(self): @@ -308,3 +319,6 @@ def test_get_or_create(self): def test_missing(self): Item().save() assert Item.objects.first().embedding is None + assert Item.objects.first().half_embedding is None + assert Item.objects.first().binary_embedding is None + assert Item.objects.first().sparse_embedding is None From b2d1913c7cfd888687e56ee6dc9243a90b03cff9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 09:47:05 -0700 Subject: [PATCH 193/424] Improved Django tests [skip ci] --- pgvector/utils/sparsevec.py | 4 ++-- tests/test_django.py | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index ebfa0a9..96bbcdf 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -79,11 +79,11 @@ def to_binary(self): @classmethod def from_text(cls, value): - elements, dim = value.split('/') + elements, dim = value.split('/', 2) indices = [] values = [] for e in elements[1:-1].split(','): - i, v = e.split(':') + i, v = e.split(':', 2) indices.append(int(i) - 1) values.append(float(v)) return cls(int(dim), indices, values) diff --git a/tests/test_django.py b/tests/test_django.py index 4fe4000..734c1ff 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -105,6 +105,12 @@ class Meta: fields = ['binary_embedding'] +class SparseVectorForm(ModelForm): + class Meta: + model = Item + fields = ['sparse_embedding'] + + class TestDjango: def setup_method(self, test_method): Item.objects.all().delete() @@ -309,8 +315,13 @@ def test_bit_form(self): assert form.is_valid() assert 'value="101"' in form.as_div() + def 
test_sparsevec_form(self): + form = SparseVectorForm(data={'sparse_embedding': '{1:1,3:2,5:3}/6'}) + assert form.is_valid() + assert 'value="{1:1,3:2,5:3}/6"' in form.as_div() + def test_clean(self): - item = Item(id=1, embedding=[1, 2, 3], binary_embedding='101') + item = Item(id=1, embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) item.full_clean() def test_get_or_create(self): From 404458990ebc64f2e49a5236bb2e6aed03d3c9da Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 09:52:17 -0700 Subject: [PATCH 194/424] Improved Django tests [skip ci] --- tests/test_django.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 734c1ff..7f563d0 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -315,10 +315,41 @@ def test_bit_form(self): assert form.is_valid() assert 'value="101"' in form.as_div() + def test_bit_form_instance(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item) + assert 'value="101"' in form.as_div() + + def test_bit_form_save(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item, data={'binary_embedding': '010'}) + assert form.has_changed() + assert form.is_valid() + assert form.save() + assert '010' == Item.objects.get(pk=1).binary_embedding + def test_sparsevec_form(self): - form = SparseVectorForm(data={'sparse_embedding': '{1:1,3:2,5:3}/6'}) + form = SparseVectorForm(data={'sparse_embedding': '{1:1,2:2,3:3}/3'}) + assert form.is_valid() + assert 'value="{1:1,2:2,3:3}/3"' in form.as_div() + + def test_sparsevec_form_instance(self): + Item(id=1, sparse_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item) + # TODO fix + # assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() + + def test_sparsevec_form_save(self): + 
Item(id=1, sparse_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item, data={'sparse_embedding': '{1:4,2:5,3:6}/3'}) + assert form.has_changed() assert form.is_valid() - assert 'value="{1:1,3:2,5:3}/6"' in form.as_div() + assert form.save() + assert [4, 5, 6] == Item.objects.get(pk=1).sparse_embedding.to_list() def test_clean(self): item = Item(id=1, embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) From 47138b0aad47f196b242ea6405f9eea7c69bf256 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 09:54:43 -0700 Subject: [PATCH 195/424] Added test for halfvec form [skip ci] --- tests/test_django.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index 7f563d0..920bae4 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -99,6 +99,12 @@ class Meta: fields = ['embedding'] +class HalfVectorForm(ModelForm): + class Meta: + model = Item + fields = ['half_embedding'] + + class BitForm(ModelForm): class Meta: model = Item @@ -310,6 +316,11 @@ def test_vector_form_save_missing(self): assert form.save() assert Item.objects.get(pk=1).embedding is None + def test_halfvec_form(self): + form = HalfVectorForm(data={'half_embedding': '[1, 2, 3]'}) + assert form.is_valid() + assert 'value="[1, 2, 3]"' in form.as_div() + def test_bit_form(self): form = BitForm(data={'binary_embedding': '101'}) assert form.is_valid() From 58718785a747ac7ca7c574c1778b50a7d76e1c4d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 10:02:53 -0700 Subject: [PATCH 196/424] Added more tests for halfvec form [skip ci] --- tests/test_django.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index 920bae4..7712d7a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -321,6 +321,22 @@ def test_halfvec_form(self): assert form.is_valid() assert 
'value="[1, 2, 3]"' in form.as_div() + def test_halfvec_form_instance(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = HalfVectorForm(instance=item) + # TODO fix + # assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() + + def test_halfvec_form_save(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = HalfVectorForm(instance=item, data={'half_embedding': '[4, 5, 6]'}) + assert form.has_changed() + assert form.is_valid() + assert form.save() + assert [4, 5, 6] == Item.objects.get(pk=1).half_embedding.to_list() + def test_bit_form(self): form = BitForm(data={'binary_embedding': '101'}) assert form.is_valid() From 2dc31ebc907ba585897b761eed3c5eb2140d6707 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 10:04:47 -0700 Subject: [PATCH 197/424] Added test for halfvec form [skip ci] --- tests/test_django.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index 7712d7a..c2ca8e3 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -337,6 +337,15 @@ def test_halfvec_form_save(self): assert form.save() assert [4, 5, 6] == Item.objects.get(pk=1).half_embedding.to_list() + def test_halfvec_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = HalfVectorForm(instance=item, data={'half_embedding': ''}) + assert form.is_valid() + # TODO fix + # assert form.save() + assert Item.objects.get(pk=1).half_embedding is None + def test_bit_form(self): form = BitForm(data={'binary_embedding': '101'}) assert form.is_valid() From 0686c58feffd7fb26b509cc83618f4b5aed18cdf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 10:31:17 -0700 Subject: [PATCH 198/424] Improved to_python to HalfVectorField --- pgvector/django/halfvec.py | 7 ++++++- tests/test_django.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py 
index a7921b7..7ea6fbb 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -26,7 +26,12 @@ def from_db_value(self, value, expression, connection): return HalfVector._from_db(value) def to_python(self, value): - return HalfVector._from_db(value) + if value is None or isinstance(value, HalfVector): + return value + elif isinstance(value, str): + return HalfVector._from_db(value) + else: + return HalfVector(value) def get_prep_value(self, value): return HalfVector._to_db(value) diff --git a/tests/test_django.py b/tests/test_django.py index c2ca8e3..108bd85 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -388,7 +388,7 @@ def test_sparsevec_form_save(self): assert [4, 5, 6] == Item.objects.get(pk=1).sparse_embedding.to_list() def test_clean(self): - item = Item(id=1, embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) + item = Item(id=1, embedding=[1, 2, 3], half_embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) item.full_clean() def test_get_or_create(self): From 1e2006a8138b674fe3d4a6080396b081096c9d84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 15:00:14 -0700 Subject: [PATCH 199/424] Fixed HalfVectorField forms for Django - fixes #75 --- pgvector/django/halfvec.py | 20 ++++++++++++++++++++ tests/test_django.py | 6 ++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 7ea6fbb..6b59a7f 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -1,3 +1,4 @@ +from django import forms from django.db.models import Field from ..utils import HalfVector @@ -38,3 +39,22 @@ def get_prep_value(self, value): def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) + + def formfield(self, **kwargs): + return super().formfield(form_class=HalfVectorFormField, **kwargs) + + +class 
HalfVectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, HalfVector): + value = value.to_list() + return super().format_value(value) + + +class HalfVectorFormField(forms.CharField): + widget = HalfVectorWidget + + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/tests/test_django.py b/tests/test_django.py index 108bd85..e76f77a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -325,8 +325,7 @@ def test_halfvec_form_instance(self): Item(id=1, half_embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) form = HalfVectorForm(instance=item) - # TODO fix - # assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() + assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() def test_halfvec_form_save(self): Item(id=1, half_embedding=[1, 2, 3]).save() @@ -342,8 +341,7 @@ def test_halfvec_form_save_missing(self): item = Item.objects.get(pk=1) form = HalfVectorForm(instance=item, data={'half_embedding': ''}) assert form.is_valid() - # TODO fix - # assert form.save() + assert form.save() assert Item.objects.get(pk=1).half_embedding is None def test_bit_form(self): From 8d6da7465055102b246cbed9f1dd01e3f8edabf5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 15:08:20 -0700 Subject: [PATCH 200/424] Improved SparseVectorField forms for Django --- pgvector/django/sparsevec.py | 11 +++++++++++ tests/test_django.py | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 4ec734f..6633b23 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -1,3 +1,4 @@ +from django import forms from django.db.models import Field from ..utils import SparseVector @@ -33,3 +34,13 @@ def get_prep_value(self, value): def value_to_string(self, obj): return self.get_prep_value(self.value_from_object(obj)) + + def formfield(self, **kwargs): + return 
super().formfield(form_class=SparseVectorFormField, **kwargs) + + +class SparseVectorFormField(forms.CharField): + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/tests/test_django.py b/tests/test_django.py index e76f77a..5692d2a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -385,6 +385,14 @@ def test_sparsevec_form_save(self): assert form.save() assert [4, 5, 6] == Item.objects.get(pk=1).sparse_embedding.to_list() + def test_sparesevec_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item, data={'sparse_embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).sparse_embedding is None + def test_clean(self): item = Item(id=1, embedding=[1, 2, 3], half_embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) item.full_clean() From 6d41b794b64e91a760cdb6268e1a8d87870b09e4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 15:10:30 -0700 Subject: [PATCH 201/424] Improved SparseVectorField forms for Django [skip ci] --- pgvector/django/sparsevec.py | 9 +++++++++ tests/test_django.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index 6633b23..d0d2d07 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -39,7 +39,16 @@ def formfield(self, **kwargs): return super().formfield(form_class=SparseVectorFormField, **kwargs) +class SparseVectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, SparseVector): + value = value.to_text() + return super().format_value(value) + + class SparseVectorFormField(forms.CharField): + widget = SparseVectorWidget + def to_python(self, value): if isinstance(value, str) and value == '': return None diff --git a/tests/test_django.py 
b/tests/test_django.py index 5692d2a..bd229fe 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -373,8 +373,8 @@ def test_sparsevec_form_instance(self): Item(id=1, sparse_embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) form = SparseVectorForm(instance=item) - # TODO fix - # assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() + # TODO improve + assert 'value="{1:1.0,2:2.0,3:3.0}/3"' in form.as_div() def test_sparsevec_form_save(self): Item(id=1, sparse_embedding=[1, 2, 3]).save() From d4469b97327241afb2e88f3b12bcb38a8def362b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 12 Jun 2024 15:16:50 -0700 Subject: [PATCH 202/424] Improved BitField forms for Django [skip ci] --- pgvector/django/bit.py | 11 +++++++++++ tests/test_django.py | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/pgvector/django/bit.py b/pgvector/django/bit.py index 941d694..2cc847a 100644 --- a/pgvector/django/bit.py +++ b/pgvector/django/bit.py @@ -1,3 +1,4 @@ +from django import forms from django.db.models import Field @@ -19,3 +20,13 @@ def db_type(self, connection): if self.length is None: return 'bit' return 'bit(%d)' % self.length + + def formfield(self, **kwargs): + return super().formfield(form_class=BitFormField, **kwargs) + + +class BitFormField(forms.CharField): + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/tests/test_django.py b/tests/test_django.py index bd229fe..186cf3a 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -364,6 +364,14 @@ def test_bit_form_save(self): assert form.save() assert '010' == Item.objects.get(pk=1).binary_embedding + def test_bit_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item, data={'binary_embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).binary_embedding is None + def test_sparsevec_form(self): form = 
SparseVectorForm(data={'sparse_embedding': '{1:1,2:2,3:3}/3'}) assert form.is_valid() From aba997c0e3a3f2df706ffb082ac17d6f2b6f1a28 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Jun 2024 10:29:02 -0700 Subject: [PATCH 203/424] Updated test name [skip ci] --- tests/test_sparse_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index c4565fb..1286dbb 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -10,7 +10,7 @@ def test_from_dense(self): assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] assert SparseVector.from_dense(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] - def test_from_coordinates(self): + def test_from_dict(self): assert SparseVector.from_dict({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] def test_from_sparse(self): From 50def9c996b0a90d7d4e4dccf6807b03a179ff13 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 16:22:41 -0700 Subject: [PATCH 204/424] Simplified SparseVector construction --- examples/sparse_search.py | 4 +- pgvector/utils/sparsevec.py | 83 ++++++++++++++++++++----------------- tests/test_asyncpg.py | 2 +- tests/test_django.py | 18 ++++---- tests/test_peewee.py | 8 ++-- tests/test_psycopg.py | 12 +++--- tests/test_psycopg2.py | 2 +- tests/test_sparse_vector.py | 37 ++++++++++++----- tests/test_sqlalchemy.py | 6 +-- tests/test_sqlmodel.py | 6 +-- 10 files changed, 100 insertions(+), 78 deletions(-) diff --git a/examples/sparse_search.py b/examples/sparse_search.py index 7e786e3..6ce33e8 100644 --- a/examples/sparse_search.py +++ b/examples/sparse_search.py @@ -45,10 +45,10 @@ def fetch_embeddings(input): ] embeddings = fetch_embeddings(input) for content, embedding in zip(input, embeddings): - conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector.from_dense(embedding))) + conn.execute('INSERT 
INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector(embedding))) query = 'forest' query_embedding = fetch_embeddings([query])[0] -result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector.from_dense(query_embedding),)).fetchall() +result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector(query_embedding),)).fetchall() for row in result: print(row[0]) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 96bbcdf..e6d7f87 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -3,48 +3,54 @@ class SparseVector: - def __init__(self, dim, indices, values): - # TODO improve - self._dim = int(dim) - self._indices = [int(i) for i in indices] - self._values = [float(v) for v in values] + def __init__(self, value, dimensions=None): + if value.__class__.__module__ == 'scipy.sparse._arrays': + if dimensions is not None: + raise ValueError('dimensions not allowed') + + self._from_sparse(value) + elif isinstance(value, dict): + self._from_dict(value, dimensions) + else: + if dimensions is not None: + raise ValueError('dimensions not allowed') + + self._from_dense(value) def __repr__(self): - return f'SparseVector({self._dim}, {self._indices}, {self._values})' + return f'SparseVector({self.to_dict()}, {self.dim()})' + + def _from_dict(self, d, dim): + if dim is None: + raise ValueError('dimensions required') - @classmethod - def from_dict(cls, d, dim): elements = [(i, v) for i, v in d.items()] elements.sort() - indices = [int(v[0]) for v in elements] - values = [float(v[1]) for v in elements] - return cls(dim, indices, values) + self._dim = int(dim) + self._indices = [int(v[0]) for v in elements] + self._values = [float(v[1]) for v in elements] - @classmethod - def from_sparse(cls, value): + def _from_sparse(self, value): value = value.tocoo() if value.ndim == 1: - dim = value.shape[0] + self._dim = value.shape[0] elif 
value.ndim == 2 and value.shape[0] == 1: - dim = value.shape[1] + self._dim = value.shape[1] else: raise ValueError('expected ndim to be 1') if hasattr(value, 'coords'): # scipy 1.13+ - indices = value.coords[0].tolist() + self._indices = value.coords[0].tolist() else: - indices = value.col.tolist() - values = value.data.tolist() - return cls(dim, indices, values) + self._indices = value.col.tolist() + self._values = value.data.tolist() - @classmethod - def from_dense(cls, value): - dim = len(value) - indices = [i for i, v in enumerate(value) if v != 0] - values = [float(value[i]) for i in indices] - return cls(dim, indices, values) + def _from_dense(self, value): + self._dim = len(value) + self._indices = [i for i, v in enumerate(value) if v != 0] + self._values = [float(value[i]) for i in self._indices] def dim(self): return self._dim @@ -86,21 +92,30 @@ def from_text(cls, value): i, v = e.split(':', 2) indices.append(int(i) - 1) values.append(float(v)) - return cls(int(dim), indices, values) + return cls._from_parts(int(dim), indices, values) @classmethod def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - return cls(int(dim), indices, values) + return cls._from_parts(int(dim), indices, values) + + @classmethod + def _from_parts(cls, dim, indices, values): + vec = cls.__new__(cls) + vec._dim = dim + vec._indices = indices + vec._values = values + return vec @classmethod def _to_db(cls, value, dim=None): if value is None: return value - value = cls._to_db_value(value) + if not isinstance(value, cls): + value = cls(value) if dim is not None and value.dim() != dim: raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) @@ -112,19 +127,11 @@ def _to_db_binary(cls, value): if value is None: return value - value = cls._to_db_value(value) + if not isinstance(value, cls): + value = cls(value) return value.to_binary() - @classmethod 
- def _to_db_value(cls, value): - if isinstance(value, cls): - return value - elif isinstance(value, (list, np.ndarray)): - return cls.from_dense(value) - else: - raise ValueError('expected sparsevec') - @classmethod def _from_db(cls, value): if value is None or isinstance(value, cls): diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 3bfc888..829883e 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -82,7 +82,7 @@ async def test_sparsevec(self): await register_vector(conn) - embedding = SparseVector.from_dense([1.5, 2, 3]) + embedding = SparseVector([1.5, 2, 3]) await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") diff --git a/tests/test_django.py b/tests/test_django.py index 186cf3a..421966f 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -88,9 +88,9 @@ class Migration(migrations.Migration): def create_items(): - Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1])).save() - Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2])).save() - Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2])).save() + Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1])).save() + Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2])).save() + Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2])).save() class VectorForm(ModelForm): @@ -208,34 +208,34 @@ def test_bit_jaccard_distance(self): # assert [v.distance for v in items] == [0, 1/3, 1] def test_sparsevec(self): - Item(id=1, 
sparse_embedding=SparseVector.from_dense([1, 2, 3])).save() + Item(id=1, sparse_embedding=SparseVector([1, 2, 3])).save() item = Item.objects.get(pk=1) assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() - distance = L2Distance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) + distance = L2Distance('sparse_embedding', SparseVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] def test_sparsevec_max_inner_product(self): create_items() - distance = MaxInnerProduct('sparse_embedding', SparseVector.from_dense([1, 1, 1])) + distance = MaxInnerProduct('sparse_embedding', SparseVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] def test_sparsevec_cosine_distance(self): create_items() - distance = CosineDistance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) + distance = CosineDistance('sparse_embedding', SparseVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] def test_sparsevec_l1_distance(self): create_items() - distance = L1Distance('sparse_embedding', SparseVector.from_dense([1, 1, 1])) + distance = L1Distance('sparse_embedding', SparseVector([1, 1, 1])) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, 3] @@ -402,7 +402,7 @@ def test_sparesevec_form_save_missing(self): assert Item.objects.get(pk=1).sparse_embedding is None def test_clean(self): - item = Item(id=1, embedding=[1, 2, 3], half_embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector.from_dense([1, 2, 3])) + item = 
Item(id=1, embedding=[1, 2, 3], half_embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector([1, 2, 3])) item.full_clean() def test_get_or_create(self): diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 1455303..0882890 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -30,9 +30,9 @@ class Meta: def create_items(): - Item.create(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1])) - Item.create(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2])) - Item.create(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2])) + Item.create(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1])) + Item.create(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2])) + Item.create(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2])) class TestPeewee: @@ -132,7 +132,7 @@ def test_sparsevec(self): def test_sparsevec_l2_distance(self): create_items() - distance = Item.sparse_embedding.l2_distance(SparseVector.from_dense([1, 1, 1])) + distance = Item.sparse_embedding.l2_distance(SparseVector([1, 1, 1])) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 2de1ec7..79ac190 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -100,20 +100,20 @@ def test_bit_text_format(self): assert repr(Bit(res)) == 'Bit(010100001)' def test_sparsevec(self): - embedding = SparseVector.from_dense([1.5, 2, 3]) + embedding = SparseVector([1.5, 
2, 3]) conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] assert res.to_list() == [1.5, 2, 3] def test_sparsevec_binary_format(self): - embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) + embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): - embedding = SparseVector.from_dense([1.5, 0, 2, 0, 3, 0]) + embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) @@ -122,20 +122,20 @@ def test_text_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: - copy.write_row([embedding, HalfVector(embedding), '101', SparseVector.from_dense(embedding)]) + copy.write_row([embedding, HalfVector(embedding), '101', SparseVector(embedding)]) def test_binary_copy(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.write_row([embedding, HalfVector(embedding), Bit('101'), SparseVector.from_dense(embedding)]) + copy.write_row([embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) def test_binary_copy_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (id, embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(['int8', 'vector', 
'halfvec', 'bit', 'sparsevec']) - copy.write_row([1, embedding, HalfVector(embedding), Bit('101'), SparseVector.from_dense(embedding)]) + copy.write_row([1, embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) @pytest.mark.asyncio async def test_async(self): diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index f18405f..54da6a7 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -46,7 +46,7 @@ def test_bit(self): assert res[1][0] is None def test_sparsevec(self): - embedding = SparseVector.from_dense([1.5, 2, 3]) + embedding = SparseVector([1.5, 2, 3]) cur.execute('INSERT INTO psycopg2_items (sparse_embedding) VALUES (%s), (NULL)', (embedding,)) cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 1286dbb..ae38e2a 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -6,27 +6,42 @@ class TestSparseVector: def test_from_dense(self): - assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_dense(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + assert SparseVector(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] + + def test_from_dense_dimensions(self): + with pytest.raises(ValueError) as error: + SparseVector([1, 0, 2, 0, 3, 0], 6) + assert str(error.value) == 'dimensions not allowed' def test_from_dict(self): - assert SparseVector.from_dict({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + + def test_from_dict_no_dimensions(self): + with pytest.raises(ValueError) as error: + 
SparseVector({0: 1, 2: 2, 4: 3}) + assert str(error.value) == 'dimensions required' def test_from_sparse(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) - assert SparseVector.from_sparse(arr).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector.from_sparse(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector(arr).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] + + def test_from_sparse_dimensions(self): + with pytest.raises(ValueError) as error: + SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) + assert str(error.value) == 'dimensions not allowed' def test_repr(self): - assert repr(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' - assert str(SparseVector.from_dense([1, 0, 2, 0, 3, 0])) == 'SparseVector(6, [0, 2, 4], [1.0, 2.0, 3.0])' + assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' + assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' def test_dim(self): - assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).dim() == 6 + assert SparseVector([1, 0, 2, 0, 3, 0]).dim() == 6 def test_to_dict(self): - assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} + assert SparseVector([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} def test_to_coo(self): - assert SparseVector.from_dense([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] + assert SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 1c0fb80..edce3dc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -41,9 +41,9 @@ class Item(Base): def create_items(): session = Session(engine) - session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1]))) - 
session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2]))) + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) session.commit() diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 90f7e21..5685ce6 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -37,9 +37,9 @@ class Item(SQLModel, table=True): def create_items(): session = Session(engine) - session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector.from_dense([1, 1, 1]))) - session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector.from_dense([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector.from_dense([1, 1, 2]))) + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) session.commit() From 7088b11380ca5e8ab57cdf7f4e5a1ce180dfd1c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 16:26:23 -0700 Subject: 
[PATCH 205/424] Renamed dim method to dimensions [skip ci] --- pgvector/utils/halfvec.py | 8 ++++---- pgvector/utils/sparsevec.py | 8 ++++---- pgvector/utils/vector.py | 8 ++++---- tests/test_half_vector.py | 4 ++-- tests/test_sparse_vector.py | 4 ++-- tests/test_vector.py | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index d01b381..e1e5051 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -16,7 +16,7 @@ def __init__(self, value): def __repr__(self): return f'HalfVector({self.to_list()})' - def dim(self): + def dimensions(self): return len(self._value) def to_list(self): @@ -29,7 +29,7 @@ def to_text(self): return '[' + ','.join([str(float(v)) for v in self._value]) + ']' def to_binary(self): - return pack('>HH', self.dim(), 0) + self._value.tobytes() + return pack('>HH', self.dimensions(), 0) + self._value.tobytes() @classmethod def from_text(cls, value): @@ -48,8 +48,8 @@ def _to_db(cls, value, dim=None): if not isinstance(value, cls): value = cls(value) - if dim is not None and value.dim() != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) return value.to_text() diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index e6d7f87..d2fff13 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -18,7 +18,7 @@ def __init__(self, value, dimensions=None): self._from_dense(value) def __repr__(self): - return f'SparseVector({self.to_dict()}, {self.dim()})' + return f'SparseVector({self.to_dict()}, {self.dimensions()})' def _from_dict(self, d, dim): if dim is None: @@ -52,7 +52,7 @@ def _from_dense(self, value): self._indices = [i for i, v in enumerate(value) if v != 0] self._values = [float(value[i]) for i in self._indices] - def dim(self): + def dimensions(self): 
return self._dim def to_dict(self): @@ -117,8 +117,8 @@ def _to_db(cls, value, dim=None): if not isinstance(value, cls): value = cls(value) - if dim is not None and value.dim() != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) return value.to_text() diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 7a0f6bc..3fa2f35 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -16,7 +16,7 @@ def __init__(self, value): def __repr__(self): return f'Vector({self.to_list()})' - def dim(self): + def dimensions(self): return len(self._value) def to_list(self): @@ -29,7 +29,7 @@ def to_text(self): return '[' + ','.join([str(float(v)) for v in self._value]) + ']' def to_binary(self): - return pack('>HH', self.dim(), 0) + self._value.tobytes() + return pack('>HH', self.dimensions(), 0) + self._value.tobytes() @classmethod def from_text(cls, value): @@ -48,8 +48,8 @@ def _to_db(cls, value, dim=None): if not isinstance(value, cls): value = cls(value) - if dim is not None and value.dim() != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, value.dim())) + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) return value.to_text() diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index b2811b2..2a0d3a3 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -38,5 +38,5 @@ def test_repr(self): assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' - def test_dim(self): - assert HalfVector([1, 2, 3]).dim() == 3 + def test_dimensions(self): + assert HalfVector([1, 2, 3]).dimensions() == 3 diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index ae38e2a..6a49f57 
100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -37,8 +37,8 @@ def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' - def test_dim(self): - assert SparseVector([1, 0, 2, 0, 3, 0]).dim() == 6 + def test_dimensions(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 def test_to_dict(self): assert SparseVector([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} diff --git a/tests/test_vector.py b/tests/test_vector.py index d6ecb5b..90d6a9b 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -38,5 +38,5 @@ def test_repr(self): assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' - def test_dim(self): - assert Vector([1, 2, 3]).dim() == 3 + def test_dimensions(self): + assert Vector([1, 2, 3]).dimensions() == 3 From 88872a91a3fac052381ea6fd00d8068e066aaf47 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 16:29:21 -0700 Subject: [PATCH 206/424] Added indices and values methods to SparseVector [skip ci] --- pgvector/utils/sparsevec.py | 10 +++++++--- tests/test_sparse_vector.py | 7 +++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index d2fff13..041e727 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -18,7 +18,8 @@ def __init__(self, value, dimensions=None): self._from_dense(value) def __repr__(self): - return f'SparseVector({self.to_dict()}, {self.dimensions()})' + elements = dict(zip(self._indices, self._values)) + return f'SparseVector({elements}, {self.dimensions()})' def _from_dict(self, d, dim): if dim is None: @@ -55,8 +56,11 @@ def _from_dense(self, value): def dimensions(self): return self._dim - def to_dict(self): - return dict(zip(self._indices, self._values)) + def 
indices(self): + return self._indices + + def values(self): + return self._values def to_coo(self): from scipy.sparse import coo_array diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 6a49f57..f03e270 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -40,8 +40,11 @@ def test_repr(self): def test_dimensions(self): assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 - def test_to_dict(self): - assert SparseVector([1, 0, 2, 0, 3, 0]).to_dict() == {0: 1, 2: 2, 4: 3} + def test_indices(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).indices() == [0, 2, 4] + + def test_values(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).values() == [1, 2, 3] def test_to_coo(self): assert SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] From 6fa839d32de6f8a69ca01896bbc45bb48afa18af Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 16:29:57 -0700 Subject: [PATCH 207/424] Improved test [skip ci] --- tests/test_sparse_vector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index f03e270..59c3844 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -16,7 +16,8 @@ def test_from_dense_dimensions(self): assert str(error.value) == 'dimensions not allowed' def test_from_dict(self): - assert SparseVector({0: 1, 2: 2, 4: 3}, 6).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6).to_list() == [1, 0, 2, 0, 3, 0] + assert SparseVector([1, 0, 2, 0, 3, 0]).indices() == [0, 2, 4] def test_from_dict_no_dimensions(self): with pytest.raises(ValueError) as error: From dfa3e98c4a0829bd9d606acb3139007865c36e62 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 16:32:55 -0700 Subject: [PATCH 208/424] Moved methods [skip ci] --- pgvector/utils/sparsevec.py | 64 ++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) 
diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 041e727..2060b2b 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -21,38 +21,6 @@ def __repr__(self): elements = dict(zip(self._indices, self._values)) return f'SparseVector({elements}, {self.dimensions()})' - def _from_dict(self, d, dim): - if dim is None: - raise ValueError('dimensions required') - - elements = [(i, v) for i, v in d.items()] - elements.sort() - self._dim = int(dim) - self._indices = [int(v[0]) for v in elements] - self._values = [float(v[1]) for v in elements] - - def _from_sparse(self, value): - value = value.tocoo() - - if value.ndim == 1: - self._dim = value.shape[0] - elif value.ndim == 2 and value.shape[0] == 1: - self._dim = value.shape[1] - else: - raise ValueError('expected ndim to be 1') - - if hasattr(value, 'coords'): - # scipy 1.13+ - self._indices = value.coords[0].tolist() - else: - self._indices = value.col.tolist() - self._values = value.data.tolist() - - def _from_dense(self, value): - self._dim = len(value) - self._indices = [i for i, v in enumerate(value) if v != 0] - self._values = [float(value[i]) for i in self._indices] - def dimensions(self): return self._dim @@ -87,6 +55,38 @@ def to_binary(self): nnz = len(self._indices) return pack(f'>iii{nnz}i{nnz}f', self._dim, nnz, 0, *self._indices, *self._values) + def _from_dict(self, d, dim): + if dim is None: + raise ValueError('dimensions required') + + elements = [(i, v) for i, v in d.items()] + elements.sort() + self._dim = int(dim) + self._indices = [int(v[0]) for v in elements] + self._values = [float(v[1]) for v in elements] + + def _from_sparse(self, value): + value = value.tocoo() + + if value.ndim == 1: + self._dim = value.shape[0] + elif value.ndim == 2 and value.shape[0] == 1: + self._dim = value.shape[1] + else: + raise ValueError('expected ndim to be 1') + + if hasattr(value, 'coords'): + # scipy 1.13+ + self._indices = value.coords[0].tolist() + else: + 
self._indices = value.col.tolist() + self._values = value.data.tolist() + + def _from_dense(self, value): + self._dim = len(value) + self._indices = [i for i, v in enumerate(value) if v != 0] + self._values = [float(value[i]) for i in self._indices] + @classmethod def from_text(cls, value): elements, dim = value.split('/', 2) From 0266a2ee38e3812bc66ba2a96d9b8bd73b58f42e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 18:39:25 -0700 Subject: [PATCH 209/424] Fixed dict constructor for SparseVector [skip ci] --- pgvector/utils/sparsevec.py | 2 +- tests/test_sparse_vector.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 2060b2b..7d1839f 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -59,7 +59,7 @@ def _from_dict(self, d, dim): if dim is None: raise ValueError('dimensions required') - elements = [(i, v) for i, v in d.items()] + elements = [(i, v) for i, v in d.items() if v != 0] elements.sort() self._dim = int(dim) self._indices = [int(v[0]) for v in elements] diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 59c3844..1fc70ba 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -16,8 +16,9 @@ def test_from_dense_dimensions(self): assert str(error.value) == 'dimensions not allowed' def test_from_dict(self): - assert SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector([1, 0, 2, 0, 3, 0]).indices() == [0, 2, 4] + vec = SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] def test_from_dict_no_dimensions(self): with pytest.raises(ValueError) as error: From 1bc26e40545dfe6719354f48d34d7f403c6cf627 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 18:40:02 -0700 Subject: [PATCH 210/424] Improved code [skip ci] --- pgvector/utils/sparsevec.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 7d1839f..5d62554 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -61,6 +61,7 @@ def _from_dict(self, d, dim): elements = [(i, v) for i, v in d.items() if v != 0] elements.sort() + self._dim = int(dim) self._indices = [int(v[0]) for v in elements] self._values = [float(v[1]) for v in elements] From ac8680e0e5851e30ac8e843a2bcc8187d37a5fc0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 18:43:34 -0700 Subject: [PATCH 211/424] Require dimensions to be positional [skip ci] --- pgvector/utils/sparsevec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 5d62554..c21e0d3 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -3,7 +3,7 @@ class SparseVector: - def __init__(self, value, dimensions=None): + def __init__(self, value, dimensions=None, /): if value.__class__.__module__ == 'scipy.sparse._arrays': if dimensions is not None: raise ValueError('dimensions not allowed') From ae8402fbc122a7d496007bb3bb62b611956025d2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 18:49:26 -0700 Subject: [PATCH 212/424] Improved tests [skip ci] --- tests/test_sparse_vector.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 1fc70ba..023493b 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -5,16 +5,19 @@ class TestSparseVector: - def test_from_dense(self): - assert SparseVector([1, 0, 2, 0, 3, 0]).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector([1, 0, 2, 0, 3, 0]).to_numpy().tolist() == [1, 0, 2, 0, 3, 0] - assert SparseVector(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] + def test_from_list(self): + vec = SparseVector([1, 0, 2, 0, 3, 0]) + assert vec.to_list() == [1, 0, 
2, 0, 3, 0] + assert vec.to_numpy().tolist() == [1, 0, 2, 0, 3, 0] - def test_from_dense_dimensions(self): + def test_from_list_dimensions(self): with pytest.raises(ValueError) as error: SparseVector([1, 0, 2, 0, 3, 0], 6) assert str(error.value) == 'dimensions not allowed' + def test_from_numpy(self): + assert SparseVector(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] + def test_from_dict(self): vec = SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) assert vec.to_list() == [1, 0, 2, 0, 3, 0] @@ -25,12 +28,12 @@ def test_from_dict_no_dimensions(self): SparseVector({0: 1, 2: 2, 4: 3}) assert str(error.value) == 'dimensions required' - def test_from_sparse(self): + def test_from_scipy(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) assert SparseVector(arr).to_list() == [1, 0, 2, 0, 3, 0] assert SparseVector(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] - def test_from_sparse_dimensions(self): + def test_from_scipy_dimensions(self): with pytest.raises(ValueError) as error: SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) assert str(error.value) == 'dimensions not allowed' From 2b89e1688dc196bdd851d56331598783a9456fa5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 20 Jun 2024 18:54:55 -0700 Subject: [PATCH 213/424] Improved tests [skip ci] --- tests/test_sparse_vector.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 023493b..064f115 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -5,39 +5,49 @@ class TestSparseVector: - def test_from_list(self): + def test_list(self): vec = SparseVector([1, 0, 2, 0, 3, 0]) assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] - def test_from_list_dimensions(self): + def test_list_dimensions(self): with pytest.raises(ValueError) as error: SparseVector([1, 0, 2, 0, 3, 0], 6) assert 
str(error.value) == 'dimensions not allowed' - def test_from_numpy(self): - assert SparseVector(np.array([1, 0, 2, 0, 3, 0])).to_list() == [1, 0, 2, 0, 3, 0] + def test_ndarray(self): + vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] - def test_from_dict(self): + def test_dict(self): vec = SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] - def test_from_dict_no_dimensions(self): + def test_dict_no_dimensions(self): with pytest.raises(ValueError) as error: SparseVector({0: 1, 2: 2, 4: 3}) assert str(error.value) == 'dimensions required' - def test_from_scipy(self): + def test_coo_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) - assert SparseVector(arr).to_list() == [1, 0, 2, 0, 3, 0] - assert SparseVector(arr.todok()).to_list() == [1, 0, 2, 0, 3, 0] + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] - def test_from_scipy_dimensions(self): + def test_coo_array_dimensions(self): with pytest.raises(ValueError) as error: SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) assert str(error.value) == 'dimensions not allowed' + def test_dok_array(self): + arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' From 0cf5ca5adb0abf3487a23698e91b8afee8e9e486 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jun 2024 16:46:14 -0700 Subject: [PATCH 214/424] Improved code --- pgvector/utils/sparsevec.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py 
index c21e0d3..a451fc6 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -1,18 +1,23 @@ import numpy as np from struct import pack, unpack_from +NO_DEFAULT = object() + class SparseVector: - def __init__(self, value, dimensions=None, /): + def __init__(self, value, dimensions=NO_DEFAULT, /): if value.__class__.__module__ == 'scipy.sparse._arrays': - if dimensions is not None: + if dimensions is not NO_DEFAULT: raise ValueError('dimensions not allowed') self._from_sparse(value) elif isinstance(value, dict): + if dimensions is NO_DEFAULT: + raise ValueError('dimensions required') + self._from_dict(value, dimensions) else: - if dimensions is not None: + if dimensions is not NO_DEFAULT: raise ValueError('dimensions not allowed') self._from_dense(value) @@ -56,9 +61,6 @@ def to_binary(self): return pack(f'>iii{nnz}i{nnz}f', self._dim, nnz, 0, *self._indices, *self._values) def _from_dict(self, d, dim): - if dim is None: - raise ValueError('dimensions required') - elements = [(i, v) for i, v in d.items() if v != 0] elements.sort() From 1451cc60b995ac304f972284741a7a8cc28b1171 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jun 2024 17:04:54 -0700 Subject: [PATCH 215/424] Fixed check --- pgvector/utils/sparsevec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index a451fc6..632962f 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -6,7 +6,7 @@ class SparseVector: def __init__(self, value, dimensions=NO_DEFAULT, /): - if value.__class__.__module__ == 'scipy.sparse._arrays': + if value.__class__.__module__.startswith('scipy.sparse.'): if dimensions is not NO_DEFAULT: raise ValueError('dimensions not allowed') From e005b45a29c985c5908d9fbf3ceb6d6bfb725f71 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jun 2024 17:07:28 -0700 Subject: [PATCH 216/424] Improved code [skip ci] --- pgvector/utils/sparsevec.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 632962f..0c35d25 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -24,7 +24,7 @@ def __init__(self, value, dimensions=NO_DEFAULT, /): def __repr__(self): elements = dict(zip(self._indices, self._values)) - return f'SparseVector({elements}, {self.dimensions()})' + return f'SparseVector({elements}, {self._dim})' def dimensions(self): return self._dim From 12370bb0b4d7e42c6ebee30f0eed360bf3c65f1d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jun 2024 17:45:39 -0700 Subject: [PATCH 217/424] Improved error messages --- pgvector/utils/sparsevec.py | 6 +++--- tests/test_sparse_vector.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 0c35d25..1c81876 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -8,17 +8,17 @@ class SparseVector: def __init__(self, value, dimensions=NO_DEFAULT, /): if value.__class__.__module__.startswith('scipy.sparse.'): if dimensions is not NO_DEFAULT: - raise ValueError('dimensions not allowed') + raise ValueError('extra argument') self._from_sparse(value) elif isinstance(value, dict): if dimensions is NO_DEFAULT: - raise ValueError('dimensions required') + raise ValueError('missing dimensions') self._from_dict(value, dimensions) else: if dimensions is not NO_DEFAULT: - raise ValueError('dimensions not allowed') + raise ValueError('extra argument') self._from_dense(value) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 064f115..643e841 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -14,7 +14,7 @@ def test_list(self): def test_list_dimensions(self): with pytest.raises(ValueError) as error: SparseVector([1, 0, 2, 0, 3, 0], 6) - assert str(error.value) == 'dimensions not allowed' + assert str(error.value) == 'extra 
argument' def test_ndarray(self): vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) @@ -29,7 +29,7 @@ def test_dict(self): def test_dict_no_dimensions(self): with pytest.raises(ValueError) as error: SparseVector({0: 1, 2: 2, 4: 3}) - assert str(error.value) == 'dimensions required' + assert str(error.value) == 'missing dimensions' def test_coo_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) @@ -40,7 +40,7 @@ def test_coo_array(self): def test_coo_array_dimensions(self): with pytest.raises(ValueError) as error: SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) - assert str(error.value) == 'dimensions not allowed' + assert str(error.value) == 'extra argument' def test_dok_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() From 3eadd987c169247a61dbc3a025552e81cd02c61e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jun 2024 18:01:38 -0700 Subject: [PATCH 218/424] Version bump to 0.3.0 [skip ci] --- CHANGELOG.md | 2 +- README.md | 18 +++++++++--------- setup.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14db629..999a859 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.0 (unreleased) +## 0.3.0 (2024-06-25) - Added support for `halfvec`, `bit`, and `sparsevec` types to Django - Added support for `halfvec`, `bit`, and `sparsevec` types to SQLAlchemy and SQLModel diff --git a/README.md b/README.md index cf39d7e..670c6ea 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Or check out some examples: - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_embeddings.py) with SentenceTransformers - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) -- [Sparse 
search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers (unreleased) +- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing - [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit @@ -61,7 +61,7 @@ class Item(models.Model): embedding = VectorField(dimensions=3) ``` -Also supports `HalfVectorField` (unreleased), `BitField` (unreleased), and `SparseVectorField` (unreleased) +Also supports `HalfVectorField`, `BitField`, and `SparseVectorField` Insert a vector @@ -78,7 +78,7 @@ from pgvector.django import L2Distance Item.objects.order_by(L2Distance('embedding', [3, 1, 2]))[:5] ``` -Also supports `MaxInnerProduct`, `CosineDistance`, `L1Distance` (unreleased), `HammingDistance` (unreleased), and `JaccardDistance` (unreleased) +Also supports `MaxInnerProduct`, `CosineDistance`, `L1Distance`, `HammingDistance`, and `JaccardDistance` Get the distance @@ -146,7 +146,7 @@ class Item(Base): embedding = mapped_column(Vector(3)) ``` -Also supports `HALFVEC` (unreleased), `BIT` (unreleased), and `SPARSEVEC` (unreleased) +Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` Insert a vector @@ -162,7 +162,7 @@ Get the nearest neighbors to a vector session.scalars(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` Get the distance @@ -228,7 +228,7 @@ class 
Item(SQLModel, table=True): embedding: Any = Field(sa_column=Column(Vector(3))) ``` -Also supports `HALFVEC` (unreleased), `BIT` (unreleased), and `SPARSEVEC` (unreleased) +Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` Insert a vector @@ -244,7 +244,7 @@ Get the nearest neighbors to a vector session.exec(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` Get the distance @@ -459,7 +459,7 @@ class Item(BaseModel): embedding = VectorField(dimensions=3) ``` -Also supports `HalfVectorField` (unreleased), `FixedBitField` (unreleased), and `SparseVectorField` (unreleased) +Also supports `HalfVectorField`, `FixedBitField`, and `SparseVectorField` Insert a vector @@ -473,7 +473,7 @@ Get the nearest neighbors to a vector Item.select().order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5) ``` -Also supports `max_inner_product`, `cosine_distance`, `l1_distance` (unreleased), `hamming_distance` (unreleased), and `jaccard_distance` (unreleased) +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` Get the distance diff --git a/setup.py b/setup.py index dff986d..37e0f68 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.2.5', + version='0.3.0', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From a782ac6669a50a8b8ee2f89e29eff91f22367d3a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 27 Jun 2024 19:58:59 -0700 Subject: [PATCH 219/424] Added Morgan fingerprint example [skip ci] --- README.md | 1 + examples/morgan_fingerprints.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 
100644 examples/morgan_fingerprints.py diff --git a/README.md b/README.md index 670c6ea..cf60ef8 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Or check out some examples: - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing +- [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/morgan_fingerprints.py) with RDKit - [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit - [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise - [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM diff --git a/examples/morgan_fingerprints.py b/examples/morgan_fingerprints.py new file mode 100644 index 0000000..afb56ec --- /dev/null +++ b/examples/morgan_fingerprints.py @@ -0,0 +1,32 @@ +# good resource +# https://www.rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints + +from pgvector.psycopg import register_vector, Bit +import psycopg +from rdkit import Chem +from rdkit.Chem import AllChem + + +def generate_fingerprint(molecule): + fpgen = AllChem.GetMorganGenerator() + return fpgen.GetFingerprintAsNumPy(Chem.MolFromSmiles(molecule)) + + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS molecules') +conn.execute('CREATE TABLE molecules (id text PRIMARY KEY, fingerprint bit(2048))') + +molecules = ['Cc1ccccc1', 'Cc1ncccc1', 'c1ccccn1'] +for molecule in 
molecules: + fingerprint = generate_fingerprint(molecule) + conn.execute('INSERT INTO molecules (id, fingerprint) VALUES (%s, %s)', (molecule, Bit(fingerprint))) + +query_molecule = 'c1ccco1' +query_fingerprint = generate_fingerprint(query_molecule) +result = conn.execute('SELECT id, fingerprint <%%> %s AS distance FROM molecules ORDER BY distance LIMIT 5', (Bit(query_fingerprint),)).fetchall() +for row in result: + print(row) From 3bba751e2a7b729210a9da3235e26148347d6fff Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 27 Jun 2024 20:12:09 -0700 Subject: [PATCH 220/424] Updated image hash example [skip ci] --- examples/hash_image_search.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/hash_image_search.py b/examples/hash_image_search.py index d30027d..33fef0e 100644 --- a/examples/hash_image_search.py +++ b/examples/hash_image_search.py @@ -1,5 +1,6 @@ from datasets import load_dataset import matplotlib.pyplot as plt +from pgvector.psycopg import register_vector, Bit import psycopg from imagehash import phash @@ -10,6 +11,9 @@ def hash_image(img): conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + conn.execute('DROP TABLE IF EXISTS images') conn.execute('CREATE TABLE images (id bigserial PRIMARY KEY, hash bit(64))') @@ -23,13 +27,13 @@ def hash_image(img): cur = conn.cursor() with cur.copy('COPY images (hash) FROM STDIN') as copy: for image in images: - copy.write_row([image['hash']]) + copy.write_row([Bit(image['hash'])]) print('Querying hashes') results = [] for i in range(5): image = dataset['test'][i]['image'] - result = conn.execute('SELECT id FROM images ORDER BY bit_count(hash # %s) LIMIT 5', (hash_image(image),)).fetchall() + result = conn.execute('SELECT id FROM images ORDER BY hash <~> %s LIMIT 5', (hash_image(image),)).fetchall() nearest_images = [dataset['train'][row[0] - 1]['image'] for row in result] 
results.append([image] + nearest_images) From d36477ccc4796a5d2d4a7e91fd5114182c15787f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 28 Jun 2024 11:26:41 -0700 Subject: [PATCH 221/424] Added topic modeling example [skip ci] --- README.md | 1 + examples/requirements.txt | 1 + examples/topic_modeling.py | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 examples/topic_modeling.py diff --git a/README.md b/README.md index cf60ef8..d2de992 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ Or check out some examples: - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing - [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/morgan_fingerprints.py) with RDKit +- [Topic modeling](https://github.com/pgvector/pgvector-python/blob/master/examples/topic_modeling.py) with Gensim - [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit - [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise - [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM diff --git a/examples/requirements.txt b/examples/requirements.txt index 6832670..204e573 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,4 +1,5 @@ datasets +gensim imagehash implicit lightfm diff --git a/examples/topic_modeling.py b/examples/topic_modeling.py new file mode 100644 index 0000000..28d7fb2 --- /dev/null +++ b/examples/topic_modeling.py @@ -0,0 +1,34 @@ +from gensim.corpora.dictionary import Dictionary +from gensim.models import LdaModel +from gensim.utils import simple_preprocess +import numpy as np 
+from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(20))') + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] + +tokens = [simple_preprocess(content) for content in input] +dictionary = Dictionary(tokens) +corpus = [dictionary.doc2bow(token) for token in tokens] +model = LdaModel(corpus, num_topics=20) + +for content, doc in zip(input, corpus): + embedding = np.array([v[1] for v in model[doc]]) + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) + +document_id = 1 +neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() +for neighbor in neighbors: + print(neighbor[0]) From cbd8b9ee5bef83d7ed064c3389bd8d88a805923b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 28 Jun 2024 11:54:24 -0700 Subject: [PATCH 222/424] Fixed example [skip ci] --- examples/topic_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/topic_modeling.py b/examples/topic_modeling.py index 28d7fb2..d4b8650 100644 --- a/examples/topic_modeling.py +++ b/examples/topic_modeling.py @@ -25,7 +25,7 @@ model = LdaModel(corpus, num_topics=20) for content, doc in zip(input, corpus): - embedding = np.array([v[1] for v in model[doc]]) + embedding = np.array([v[1] for v in model.get_document_topics(doc, minimum_probability=0)]) conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) document_id = 1 From 88b808b7988b4372414ada22659f9709e0fd5ba6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 28 Jun 2024 
12:02:20 -0700 Subject: [PATCH 223/424] Improved example [skip ci] --- examples/topic_modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/topic_modeling.py b/examples/topic_modeling.py index d4b8650..6fa6b97 100644 --- a/examples/topic_modeling.py +++ b/examples/topic_modeling.py @@ -19,9 +19,9 @@ 'The bear is growling' ] -tokens = [simple_preprocess(content) for content in input] -dictionary = Dictionary(tokens) -corpus = [dictionary.doc2bow(token) for token in tokens] +docs = [simple_preprocess(content) for content in input] +dictionary = Dictionary(docs) +corpus = [dictionary.doc2bow(tokens) for tokens in docs] model = LdaModel(corpus, num_topics=20) for content, doc in zip(input, corpus): From 41692021c22fed6ebe62cbdad024f0e1e28c30d3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 28 Jun 2024 12:35:47 -0700 Subject: [PATCH 224/424] Improved naming [skip ci] --- examples/topic_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/topic_modeling.py b/examples/topic_modeling.py index 6fa6b97..875ac88 100644 --- a/examples/topic_modeling.py +++ b/examples/topic_modeling.py @@ -24,8 +24,8 @@ corpus = [dictionary.doc2bow(tokens) for tokens in docs] model = LdaModel(corpus, num_topics=20) -for content, doc in zip(input, corpus): - embedding = np.array([v[1] for v in model.get_document_topics(doc, minimum_probability=0)]) +for content, bow in zip(input, corpus): + embedding = np.array([v[1] for v in model.get_document_topics(bow, minimum_probability=0)]) conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) document_id = 1 From 25074e04aa48fbe526762b0bca7dc3eb2d3dbf78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 28 Jun 2024 12:39:50 -0700 Subject: [PATCH 225/424] Improved example [skip ci] --- examples/topic_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/topic_modeling.py b/examples/topic_modeling.py index 
875ac88..cfbf18d 100644 --- a/examples/topic_modeling.py +++ b/examples/topic_modeling.py @@ -21,6 +21,7 @@ docs = [simple_preprocess(content) for content in input] dictionary = Dictionary(docs) +dictionary.filter_extremes(no_below=1) corpus = [dictionary.doc2bow(tokens) for tokens in docs] model = LdaModel(corpus, num_topics=20) From a55d2c26b9406009376fd6713d6a691d4ffe2893 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 1 Jul 2024 11:26:19 -0700 Subject: [PATCH 226/424] Added test for COPY TO with Psycopg 3 --- tests/test_psycopg.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 79ac190..44588a8 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -118,25 +118,34 @@ def test_sparsevec_text_format(self): assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) - def test_text_copy(self): + def test_text_copy_from(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: copy.write_row([embedding, HalfVector(embedding), '101', SparseVector(embedding)]) - def test_binary_copy(self): + def test_binary_copy_from(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.write_row([embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) - def test_binary_copy_set_types(self): + def test_binary_copy_from_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() with cur.copy("COPY psycopg_items (id, embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(['int8', 'vector', 'halfvec', 'bit', 'sparsevec']) copy.write_row([1, embedding, 
HalfVector(embedding), Bit('101'), SparseVector(embedding)]) + def test_binary_copy_to_set_types(self): + embedding = np.array([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + copy.set_types(['vector']) + for row in copy.rows(): + assert np.array_equal(row[0], embedding) + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From ae23e0b01ca5a4111d57fc9d632dd8cfb2f114f2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 1 Jul 2024 11:33:23 -0700 Subject: [PATCH 227/424] Added more tests for COPY TO with Psycopg 3 --- tests/test_psycopg.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 44588a8..d08c7a4 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -137,6 +137,22 @@ def test_binary_copy_from_set_types(self): copy.set_types(['int8', 'vector', 'halfvec', 'bit', 'sparsevec']) copy.write_row([1, embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) + def test_text_copy_to(self): + embedding = np.array([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding) TO STDOUT") as copy: + for row in copy.rows(): + assert row[0] == "[1.5,2,3]" + + def test_binary_copy_to(self): + embedding = np.array([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + for row in copy.rows(): + assert Vector.from_binary(row[0]).to_list() == [1.5, 2, 3] + def test_binary_copy_to_set_types(self): embedding = np.array([1.5, 2, 3]) conn.execute('INSERT INTO psycopg_items (embedding) 
VALUES (%s)', (embedding,)) From 29b4ee330a019428ca515662c31738f9b28f9fdd Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 1 Jul 2024 11:37:29 -0700 Subject: [PATCH 228/424] Improved tests for COPY TO with Psycopg 3 [skip ci] --- tests/test_psycopg.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index d08c7a4..76d2840 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -139,28 +139,34 @@ def test_binary_copy_from_set_types(self): def test_text_copy_to(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding) TO STDOUT") as copy: + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT") as copy: for row in copy.rows(): assert row[0] == "[1.5,2,3]" + assert row[1] == "[1.5,2,3]" def test_binary_copy_to(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: for row in copy.rows(): assert Vector.from_binary(row[0]).to_list() == [1.5, 2, 3] + assert HalfVector.from_binary(row[1]).to_list() == [1.5, 2, 3] def test_binary_copy_to_set_types(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s)', (embedding,)) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT 
INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) cur = conn.cursor() - with cur.copy("COPY psycopg_items (embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: - copy.set_types(['vector']) + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + copy.set_types(['vector', 'halfvec']) for row in copy.rows(): assert np.array_equal(row[0], embedding) + assert row[1].to_list() == [1.5, 2, 3] @pytest.mark.asyncio async def test_async(self): From a07361bee7b248fe54c6b55263f6113c589e2ba3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 4 Jul 2024 08:52:02 -0700 Subject: [PATCH 229/424] Fixed NameError when vector type not found with Psycopg - fixes #77 --- pgvector/psycopg/vector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index 55e31a6..0f62ca9 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -1,3 +1,4 @@ +import psycopg from psycopg.adapt import Loader, Dumper from psycopg.pq import Format from ..utils import Vector From 2e253ce10169595686b0ac9b0e5c609f9297cf47 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 10 Jul 2024 00:17:12 -0700 Subject: [PATCH 230/424] Fixed error with Psycopg 2 and pgvector < 0.7 - fixes #79 --- CHANGELOG.md | 4 ++++ pgvector/psycopg2/register.py | 24 ++++++++++-------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 999a859..be87327 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.1 (unreleased) + +- Fixed error with Psycopg 2 and pgvector < 0.7 + ## 0.3.0 (2024-06-25) - Added support for `halfvec`, `bit`, and `sparsevec` types to Django diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 0ffd461..f299e86 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -7,20 +7,16 @@ def register_vector(conn_or_curs=None): cur = 
conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs - try: - cur.execute('SELECT NULL::vector') - register_vector_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: + cur.execute("SELECT typname, oid FROM pg_type WHERE typname IN ('vector', 'halfvec', 'sparsevec')") + type_info = dict(cur.fetchall()) + + if 'vector' not in type_info: raise psycopg2.ProgrammingError('vector type not found in the database') - try: - cur.execute('SELECT NULL::halfvec') - register_halfvec_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: - pass + register_vector_info(type_info['vector']) + + if 'halfvec' in type_info: + register_halfvec_info(type_info['halfvec']) - try: - cur.execute('SELECT NULL::sparsevec') - register_sparsevec_info(cur.description[0][1]) - except psycopg2.errors.UndefinedObject: - pass + if 'sparsevec' in type_info: + register_sparsevec_info(type_info['sparsevec']) From 633cbd724380d445f47e405b801964c4b60fba6a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 10 Jul 2024 00:31:37 -0700 Subject: [PATCH 231/424] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index be87327..00714d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.3.1 (unreleased) - Fixed error with Psycopg 2 and pgvector < 0.7 +- Fixed error message when `vector` type not found with Psycopg 3 ## 0.3.0 (2024-06-25) From 5c65394fd080d47e9813288e951e75511543f703 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 10 Jul 2024 12:21:05 -0700 Subject: [PATCH 232/424] Fixed error parsing zero sparse vectors --- CHANGELOG.md | 1 + pgvector/utils/sparsevec.py | 10 ++++++---- tests/test_sparse_vector.py | 4 ++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00714d8..972215d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.3.1 (unreleased) +- Fixed error parsing zero sparse vectors - 
Fixed error with Psycopg 2 and pgvector < 0.7 - Fixed error message when `vector` type not found with Psycopg 3 diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 1c81876..fd9ccff 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -95,10 +95,12 @@ def from_text(cls, value): elements, dim = value.split('/', 2) indices = [] values = [] - for e in elements[1:-1].split(','): - i, v = e.split(':', 2) - indices.append(int(i) - 1) - values.append(float(v)) + # split on empty string returns single element list + if len(elements) > 2: + for e in elements[1:-1].split(','): + i, v = e.split(':', 2) + indices.append(int(i) - 1) + values.append(float(v)) return cls._from_parts(int(dim), indices, values) @classmethod diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 643e841..06fe81a 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -63,3 +63,7 @@ def test_values(self): def test_to_coo(self): assert SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] + + def test_zero_vector_text(self): + vec = SparseVector({}, 3) + assert vec.to_list() == SparseVector.from_text(vec.to_text()).to_list() From fcd7f613ca24fbfa50f2e8919dfe3382216519c5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 10 Jul 2024 19:07:14 -0700 Subject: [PATCH 233/424] Fixed backwards compatibility of type info query for Psycopg 2 --- pgvector/psycopg2/register.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index f299e86..e587721 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -7,7 +7,8 @@ def register_vector(conn_or_curs=None): cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs - cur.execute("SELECT typname, oid FROM pg_type WHERE typname IN ('vector', 'halfvec', 'sparsevec')") + # use to_regtype to get first matching type in 
search path + cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") type_info = dict(cur.fetchall()) if 'vector' not in type_info: From 13450b05a741d3258ce60cfd352e95dbe55892b8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 10 Jul 2024 20:33:25 -0700 Subject: [PATCH 234/424] Version bump to 0.3.1 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 972215d..1ffc98c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.1 (unreleased) +## 0.3.1 (2024-07-10) - Fixed error parsing zero sparse vectors - Fixed error with Psycopg 2 and pgvector < 0.7 diff --git a/setup.py b/setup.py index 37e0f68..cd7f909 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.3.0', + version='0.3.1', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From 182fe8491326d4e63b6bd5f376022a31fc6ecde5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 15 Jul 2024 17:04:13 -0700 Subject: [PATCH 235/424] Added Cohere example [skip ci] --- README.md | 1 + examples/cohere_embeddings.py | 34 ++++++++++++++++++++++++++++++++++ examples/requirements.txt | 1 + 3 files changed, 36 insertions(+) create mode 100644 examples/cohere_embeddings.py diff --git a/README.md b/README.md index d2de992..531973b 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ And follow the instructions for your database library: Or check out some examples: - [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai_embeddings.py) with OpenAI +- [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere_embeddings.py) with Cohere - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_embeddings.py) with SentenceTransformers 
- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) diff --git a/examples/cohere_embeddings.py b/examples/cohere_embeddings.py new file mode 100644 index 0000000..780352a --- /dev/null +++ b/examples/cohere_embeddings.py @@ -0,0 +1,34 @@ +import cohere +import numpy as np +from pgvector.psycopg import register_vector, Bit +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') + + +def fetch_embeddings(input, input_type): + co = cohere.Client() + response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) + return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = fetch_embeddings(input, 'search_document') +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, Bit(embedding))) + +query = 'forest' +query_embedding = fetch_embeddings([query], 'search_query')[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <~> %s LIMIT 5', (Bit(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/requirements.txt b/examples/requirements.txt index 204e573..8b03fab 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,3 +1,4 @@ +cohere datasets gensim imagehash From 9c98a2d9730424e0705c97f4e684a65bbb7fedd5 Mon Sep 17 
00:00:00 2001 From: Andrew Kane Date: Mon, 15 Jul 2024 17:15:39 -0700 Subject: [PATCH 236/424] Added todo [skip ci] --- pgvector/utils/bit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 8a6f2ac..fabac73 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -7,6 +7,7 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: + # TODO use np.unpackbits for uint8 array value = np.asarray(value, dtype=bool) if value.ndim != 1: From b87552b004bc0d665ebc2f93801cad9ce02c853b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 16 Jul 2024 17:45:33 -0700 Subject: [PATCH 237/424] Improved example [skip ci] --- examples/bulk_loading.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/bulk_loading.py b/examples/bulk_loading.py index 1df8fe7..ba058c4 100644 --- a/examples/bulk_loading.py +++ b/examples/bulk_loading.py @@ -38,7 +38,8 @@ print('\nSuccess!') # create any indexes *after* loading initial data (skipping for this example) -if False: +create_index = False +if create_index: print('Creating index') conn.execute("SET maintenance_work_mem = '8GB'") conn.execute('SET max_parallel_maintenance_workers = 7') From 4672a4b8a83b12c5abc5775839386928d94c6788 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Jul 2024 10:37:37 -0700 Subject: [PATCH 238/424] Fixed error with asyncpg and pgvector < 0.7 - fixes #83 --- CHANGELOG.md | 4 ++++ pgvector/asyncpg/register.py | 28 ++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ffc98c..3b13d08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.2 (unreleased) + +- Fixed error with asyncpg and pgvector < 0.7 + ## 0.3.1 (2024-07-10) - Fixed error parsing zero sparse vectors diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py index 7148ffa..ad75c2d 100644 --- 
a/pgvector/asyncpg/register.py +++ b/pgvector/asyncpg/register.py @@ -9,16 +9,20 @@ async def register_vector(conn): format='binary' ) - await conn.set_type_codec( - 'halfvec', - encoder=HalfVector._to_db_binary, - decoder=HalfVector._from_db_binary, - format='binary' - ) + try: + await conn.set_type_codec( + 'halfvec', + encoder=HalfVector._to_db_binary, + decoder=HalfVector._from_db_binary, + format='binary' + ) - await conn.set_type_codec( - 'sparsevec', - encoder=SparseVector._to_db_binary, - decoder=SparseVector._from_db_binary, - format='binary' - ) + await conn.set_type_codec( + 'sparsevec', + encoder=SparseVector._to_db_binary, + decoder=SparseVector._from_db_binary, + format='binary' + ) + except ValueError as e: + if not str(e).startswith('unknown type:'): + raise e From e77ee138d318b15c6276e0117a5905cc77e78527 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Jul 2024 13:15:18 -0700 Subject: [PATCH 239/424] Updated publish task [skip ci] --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2199d93..c64f942 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ lint: publish: clean python3 setup.py bdist_wheel --universal ls dist - # twine upload dist/* + twine upload dist/* make clean clean: From 4498caaece8b72e4d4c2f67d400fbc6eb14a0ae4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Jul 2024 13:32:03 -0700 Subject: [PATCH 240/424] Added test for Bit constructor with uint8 --- pgvector/utils/bit.py | 7 ++++++- tests/test_bit.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index fabac73..f2de6d7 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -7,7 +7,12 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: - # TODO use np.unpackbits for uint8 array + # TODO change in 0.4.0 + # if isinstance(value, np.ndarray) and value.dtype == np.uint8: + # value = 
np.unpackbits(value) + # else: + # value = np.asarray(value, dtype=bool) + value = np.asarray(value, dtype=bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index c42bd05..32ab87b 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -13,6 +13,12 @@ def test_tuple(self): def test_str(self): assert Bit('101').to_list() == [True, False, True] + def test_ndarray_uint8(self): + arr = np.array([254, 7, 0], dtype=np.uint8) + # TODO change in 0.4.0 + # assert Bit(arr).to_text() == '111111100000011100000000' + assert Bit(arr).to_text() == '110' + def test_ndarray_same_object(self): arr = np.array([True, False, True]) assert Bit(arr).to_list() == [True, False, True] From 50fbcabe46ac0a503a5fb74d19fad838f75ae5e0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Jul 2024 13:33:59 -0700 Subject: [PATCH 241/424] Added todo [skip ci] --- pgvector/utils/bit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index f2de6d7..51f7556 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -8,6 +8,7 @@ def __init__(self, value): self._value = self.from_text(value)._value else: # TODO change in 0.4.0 + # TODO raise if dtype not bool or uint8 # if isinstance(value, np.ndarray) and value.dtype == np.uint8: # value = np.unpackbits(value) # else: From 4f721eb96c4c26b85479536c3c6df1076ffd9dbe Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Jul 2024 13:52:12 -0700 Subject: [PATCH 242/424] Version bump to 0.3.2 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b13d08..e47f34f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.2 (unreleased) +## 0.3.2 (2024-07-17) - Fixed error with asyncpg and pgvector < 0.7 diff --git a/setup.py b/setup.py index cd7f909..db3df5c 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.3.1', + version='0.3.2', 
description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From 2fc3ccdc1567b389f94c81be262ca282d87e5bd1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 2 Aug 2024 22:46:06 -0700 Subject: [PATCH 243/424] Updated readme [skip ci] --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 531973b..b75cca0 100644 --- a/README.md +++ b/README.md @@ -530,3 +530,11 @@ pip install -r requirements.txt createdb pgvector_python_test pytest ``` + +To run an example: + +```sh +cd examples +pip install -r requirements.txt +python3 bulk_loading.py +``` From 93780f9f5952730e0c689b5bfc82d2dd93401e7d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 2 Aug 2024 22:49:42 -0700 Subject: [PATCH 244/424] Updated readme [skip ci] --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b75cca0..2b8253b 100644 --- a/README.md +++ b/README.md @@ -536,5 +536,6 @@ To run an example: ```sh cd examples pip install -r requirements.txt +createdb pgvector_example python3 bulk_loading.py ``` From 3e32cef2688ced358d4d1f9c4bb9c8211d15998e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 12 Aug 2024 12:31:55 -0700 Subject: [PATCH 245/424] Added logging for Django tests [skip ci] --- tests/test_django.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test_django.py b/tests/test_django.py index 421966f..79837a3 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -7,6 +7,7 @@ from django.forms import ModelForm from math import sqrt import numpy as np +import os import pgvector.django from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector from unittest import mock @@ -17,7 +18,25 @@ 'ENGINE': 
'django.db.backends.postgresql', 'NAME': 'pgvector_python_test', } - } + }, + LOGGING={ + 'version': 1, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler' + } + }, + 'loggers': { + 'django.db.backends': { + 'handlers': ['console'], + 'level': 'DEBUG' + }, + 'django.db.backends.schema': { + 'level': 'WARNING' + } + } + }, + DEBUG=('VERBOSE' in os.environ) ) django.setup() From 04c39510910ec0b037e33817b6f54846fa1b4624 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 12 Aug 2024 12:32:46 -0700 Subject: [PATCH 246/424] Improved code [skip ci] --- tests/test_django.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 79837a3..06d86a4 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -19,6 +19,7 @@ 'NAME': 'pgvector_python_test', } }, + DEBUG=('VERBOSE' in os.environ), LOGGING={ 'version': 1, 'handlers': { @@ -35,8 +36,7 @@ 'level': 'WARNING' } } - }, - DEBUG=('VERBOSE' in os.environ) + } ) django.setup() From 7fbb252788b64bf2cfa0d8bd4b59e7cd7a5a61c2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 12 Aug 2024 15:33:03 -0700 Subject: [PATCH 247/424] Added test for vector[] with Psycopg 3 [skip ci] --- tests/test_psycopg.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 76d2840..c4e1c22 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -7,7 +7,7 @@ conn.execute('CREATE EXTENSION IF NOT EXISTS vector') conn.execute('DROP TABLE IF EXISTS psycopg_items') -conn.execute('CREATE TABLE psycopg_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') +conn.execute('CREATE TABLE psycopg_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[])') register_vector(conn) @@ -168,6 +168,14 @@ def 
test_binary_copy_to_set_types(self): assert np.array_equal(row[0], embedding) assert row[1].to_list() == [1.5, 2, 3] + def test_vector_array(self): + embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + conn.execute('INSERT INTO psycopg_items (embeddings) VALUES (%s)', (embeddings,)) + + res = conn.execute('SELECT embeddings FROM psycopg_items ORDER BY id').fetchone() + assert np.array_equal(res[0][0], embeddings[0]) + assert np.array_equal(res[0][1], embeddings[1]) + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) From 94c8c5494801ec3b254bb5bc7d40dde426a884d4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 12 Aug 2024 23:39:06 -0700 Subject: [PATCH 248/424] Added ColBERT example [skip ci] --- README.md | 1 + examples/colbert_exact.py | 48 +++++++++++++++++++++++++++++++++++++++ examples/requirements.txt | 1 + 3 files changed, 50 insertions(+) create mode 100644 examples/colbert_exact.py diff --git a/README.md b/README.md index 2b8253b..3d197ee 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Or check out some examples: - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers +- [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert_exact.py) with ColBERT - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing - [Morgan 
fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/morgan_fingerprints.py) with RDKit diff --git a/examples/colbert_exact.py b/examples/colbert_exact.py new file mode 100644 index 0000000..f953824 --- /dev/null +++ b/examples/colbert_exact.py @@ -0,0 +1,48 @@ +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +import numpy as np +from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings vector(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=ColBERTConfig()) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input) +for content, embeddings in zip(input, doc_embeddings): + embeddings = [e.numpy() for e in embeddings if e.count_nonzero() > 0] + conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query], bsize=1)[0]] +result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim 
FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) diff --git a/examples/requirements.txt b/examples/requirements.txt index 8b03fab..45d8f17 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,4 +1,5 @@ cohere +colbert-ai datasets gensim imagehash From 02ee9161d96ca410b45a7374e202de7e73590127 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 13 Aug 2024 00:13:05 -0700 Subject: [PATCH 249/424] Improved example [skip ci] --- examples/colbert_exact.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/colbert_exact.py b/examples/colbert_exact.py index f953824..dd6802d 100644 --- a/examples/colbert_exact.py +++ b/examples/colbert_exact.py @@ -1,6 +1,5 @@ from colbert.infra import ColBERTConfig from colbert.modeling.checkpoint import Checkpoint -import numpy as np from pgvector.psycopg import register_vector import psycopg @@ -29,7 +28,7 @@ $$ LANGUAGE SQL """) -checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=ColBERTConfig()) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=ColBERTConfig(), verbose=0) input = [ 'The dog is barking', @@ -42,7 +41,7 @@ conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) query = 'puppy' -query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query], bsize=1)[0]] +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() for row in result: print(row) From 1d98ca2b1eff47bd81ec5129859f2a4c750eed54 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 13 Aug 2024 00:14:46 -0700 Subject: [PATCH 250/424] Improved example [skip ci] --- examples/colbert_exact.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/colbert_exact.py 
b/examples/colbert_exact.py index dd6802d..672de4c 100644 --- a/examples/colbert_exact.py +++ b/examples/colbert_exact.py @@ -35,9 +35,9 @@ 'The cat is purring', 'The bear is growling' ] -doc_embeddings = checkpoint.docFromText(input) +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) for content, embeddings in zip(input, doc_embeddings): - embeddings = [e.numpy() for e in embeddings if e.count_nonzero() > 0] + embeddings = [e.numpy() for e in embeddings] conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) query = 'puppy' From 780c1966be1ff4c205c026bd9dfa8255f7e4f5e5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 13 Aug 2024 12:29:07 -0700 Subject: [PATCH 251/424] Improved example [skip ci] --- examples/colbert_exact.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/colbert_exact.py b/examples/colbert_exact.py index 672de4c..06e4e76 100644 --- a/examples/colbert_exact.py +++ b/examples/colbert_exact.py @@ -28,7 +28,8 @@ $$ LANGUAGE SQL """) -checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=ColBERTConfig(), verbose=0) +config = ColBERTConfig(query_maxlen=5) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) input = [ 'The dog is barking', From 472bdabde3f6a6b34a929063500ea4311f835b33 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 13 Aug 2024 12:33:17 -0700 Subject: [PATCH 252/424] Improved example [skip ci] --- examples/colbert_exact.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/colbert_exact.py b/examples/colbert_exact.py index 06e4e76..1c90b47 100644 --- a/examples/colbert_exact.py +++ b/examples/colbert_exact.py @@ -28,7 +28,7 @@ $$ LANGUAGE SQL """) -config = ColBERTConfig(query_maxlen=5) +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) input = [ From 
a8a1bf106c079d330e1af3d9bbfb02df84fa6de6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 25 Aug 2024 12:38:22 -0700 Subject: [PATCH 253/424] Added tests for Psycopg 2 cursor factories --- tests/test_psycopg2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 54da6a7..e6ba996 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,6 +1,7 @@ import numpy as np from pgvector.psycopg2 import register_vector, SparseVector import psycopg2 +from psycopg2.extras import DictCursor, NamedTupleCursor conn = psycopg2.connect(dbname='pgvector_python_test') conn.autocommit = True @@ -53,3 +54,16 @@ def test_sparsevec(self): res = cur.fetchall() assert res[0][0].to_list() == [1.5, 2, 3] assert res[1][0] is None + + def test_cursor_factory(self): + for cursor_factory in [DictCursor, NamedTupleCursor]: + conn = psycopg2.connect(dbname='pgvector_python_test') + cur = conn.cursor(cursor_factory=cursor_factory) + register_vector(cur) + conn.close() + + def test_cursor_factory_connection(self): + for cursor_factory in [DictCursor, NamedTupleCursor]: + conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) + register_vector(conn) + conn.close() From d0a3c5a8c7fac6e03cd6f8bac9319e3e6d0746e8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 25 Aug 2024 12:43:49 -0700 Subject: [PATCH 254/424] Improved support for cursor factories with Psycopg 2 - closes #89 --- CHANGELOG.md | 4 ++++ pgvector/psycopg2/register.py | 4 +++- tests/test_psycopg2.py | 6 +++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e47f34f..6a856c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.3 (unreleased) + +- Improved support for cursor factories with Psycopg 2 + ## 0.3.2 (2024-07-17) - Fixed error with asyncpg and pgvector < 0.7 diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 
e587721..60a7aa7 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -1,11 +1,13 @@ import psycopg2 +from psycopg2.extensions import cursor from .halfvec import register_halfvec_info from .sparsevec import register_sparsevec_info from .vector import register_vector_info def register_vector(conn_or_curs=None): - cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs + conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection + cur = conn.cursor(cursor_factory=cursor) # use to_regtype to get first matching type in search path cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index e6ba996..24e99d6 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.psycopg2 import register_vector, SparseVector import psycopg2 -from psycopg2.extras import DictCursor, NamedTupleCursor +from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor conn = psycopg2.connect(dbname='pgvector_python_test') conn.autocommit = True @@ -56,14 +56,14 @@ def test_sparsevec(self): assert res[1][0] is None def test_cursor_factory(self): - for cursor_factory in [DictCursor, NamedTupleCursor]: + for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test') cur = conn.cursor(cursor_factory=cursor_factory) register_vector(cur) conn.close() def test_cursor_factory_connection(self): - for cursor_factory in [DictCursor, NamedTupleCursor]: + for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) register_vector(conn) conn.close() From 7e7a851d413d07cd7701ba7c411ed3093b531934 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 28 Aug 2024 13:35:47 -0700 
Subject: [PATCH 255/424] Moved examples to separate directories [skip ci] --- README.md | 36 +++++++++---------- examples/{citus.py => citus/example.py} | 0 examples/citus/requirements.txt | 3 ++ .../example.py} | 0 examples/cohere/requirements.txt | 3 ++ .../{colbert_exact.py => colbert/exact.py} | 0 examples/colbert/requirements.txt | 3 ++ .../{topic_modeling.py => gensim/example.py} | 0 examples/gensim/requirements.txt | 5 +++ .../cross_encoder.py} | 0 examples/hybrid_search/requirements.txt | 3 ++ .../rrf.py} | 0 .../example.py} | 0 examples/image_search/requirements.txt | 6 ++++ .../example.py} | 2 +- examples/imagehash/requirements.txt | 5 +++ .../{implicit_recs.py => implicit/example.py} | 0 examples/implicit/requirements.txt | 4 +++ .../{lightfm_recs.py => lightfm/example.py} | 0 examples/lightfm/requirements.txt | 4 +++ .../{bulk_loading.py => loading/example.py} | 0 examples/loading/requirements.txt | 3 ++ .../example.py} | 0 examples/openai/requirements.txt | 3 ++ .../example.py} | 0 examples/rdkit/requirements.txt | 3 ++ examples/requirements.txt | 13 ------- .../example.py} | 0 .../sentence_transformers/requirements.txt | 3 ++ .../example.py} | 0 examples/sparse_search/requirements.txt | 5 +++ .../{surprise_recs.py => surprise/example.py} | 0 examples/surprise/requirements.txt | 4 +++ 33 files changed, 76 insertions(+), 32 deletions(-) rename examples/{citus.py => citus/example.py} (100%) create mode 100644 examples/citus/requirements.txt rename examples/{cohere_embeddings.py => cohere/example.py} (100%) create mode 100644 examples/cohere/requirements.txt rename examples/{colbert_exact.py => colbert/exact.py} (100%) create mode 100644 examples/colbert/requirements.txt rename examples/{topic_modeling.py => gensim/example.py} (100%) create mode 100644 examples/gensim/requirements.txt rename examples/{hybrid_search.py => hybrid_search/cross_encoder.py} (100%) create mode 100644 examples/hybrid_search/requirements.txt rename examples/{hybrid_search_rrf.py 
=> hybrid_search/rrf.py} (100%) rename examples/{pytorch_image_search.py => image_search/example.py} (100%) create mode 100644 examples/image_search/requirements.txt rename examples/{hash_image_search.py => imagehash/example.py} (100%) create mode 100644 examples/imagehash/requirements.txt rename examples/{implicit_recs.py => implicit/example.py} (100%) create mode 100644 examples/implicit/requirements.txt rename examples/{lightfm_recs.py => lightfm/example.py} (100%) create mode 100644 examples/lightfm/requirements.txt rename examples/{bulk_loading.py => loading/example.py} (100%) create mode 100644 examples/loading/requirements.txt rename examples/{openai_embeddings.py => openai/example.py} (100%) create mode 100644 examples/openai/requirements.txt rename examples/{morgan_fingerprints.py => rdkit/example.py} (100%) create mode 100644 examples/rdkit/requirements.txt delete mode 100644 examples/requirements.txt rename examples/{sentence_embeddings.py => sentence_transformers/example.py} (100%) create mode 100644 examples/sentence_transformers/requirements.txt rename examples/{sparse_search.py => sparse_search/example.py} (100%) create mode 100644 examples/sparse_search/requirements.txt rename examples/{surprise_recs.py => surprise/example.py} (100%) create mode 100644 examples/surprise/requirements.txt diff --git a/README.md b/README.md index 3d197ee..f68fda4 100644 --- a/README.md +++ b/README.md @@ -26,22 +26,22 @@ And follow the instructions for your database library: Or check out some examples: -- [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai_embeddings.py) with OpenAI -- [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere_embeddings.py) with Cohere -- [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_embeddings.py) with SentenceTransformers -- [Hybrid 
search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search_rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) -- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py) with SentenceTransformers (cross-encoder) -- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search.py) with Transformers -- [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert_exact.py) with ColBERT -- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/pytorch_image_search.py) with PyTorch -- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/hash_image_search.py) with perceptual hashing -- [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/morgan_fingerprints.py) with RDKit -- [Topic modeling](https://github.com/pgvector/pgvector-python/blob/master/examples/topic_modeling.py) with Gensim -- [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit_recs.py) with Implicit -- [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise_recs.py) with Surprise -- [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm_recs.py) with LightFM -- [Horizontal scaling](https://github.com/pgvector/pgvector-python/blob/master/examples/citus.py) with Citus -- [Bulk loading](https://github.com/pgvector/pgvector-python/blob/master/examples/bulk_loading.py) with `COPY` +- [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai/example.py) with OpenAI +- [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere/example.py) with Cohere +- [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py) with 
SentenceTransformers +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) +- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers +- [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing +- [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit +- [Topic modeling](https://github.com/pgvector/pgvector-python/blob/master/examples/gensim/example.py) with Gensim +- [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit/example.py) with Implicit +- [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise/example.py) with Surprise +- [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm/example.py) with LightFM +- [Horizontal scaling](https://github.com/pgvector/pgvector-python/blob/master/examples/citus/example.py) with Citus +- [Bulk loading](https://github.com/pgvector/pgvector-python/blob/master/examples/loading/example.py) with `COPY` ## Django @@ -535,8 +535,8 @@ pytest To run an example: ```sh -cd examples +cd examples/loading pip install -r requirements.txt createdb pgvector_example -python3 bulk_loading.py +python3 example.py ``` diff --git a/examples/citus.py b/examples/citus/example.py similarity index 100% rename from 
examples/citus.py rename to examples/citus/example.py diff --git a/examples/citus/requirements.txt b/examples/citus/requirements.txt new file mode 100644 index 0000000..1cf8ee9 --- /dev/null +++ b/examples/citus/requirements.txt @@ -0,0 +1,3 @@ +numpy +pgvector +psycopg[binary] diff --git a/examples/cohere_embeddings.py b/examples/cohere/example.py similarity index 100% rename from examples/cohere_embeddings.py rename to examples/cohere/example.py diff --git a/examples/cohere/requirements.txt b/examples/cohere/requirements.txt new file mode 100644 index 0000000..22fd056 --- /dev/null +++ b/examples/cohere/requirements.txt @@ -0,0 +1,3 @@ +cohere +pgvector +psycopg[binary] diff --git a/examples/colbert_exact.py b/examples/colbert/exact.py similarity index 100% rename from examples/colbert_exact.py rename to examples/colbert/exact.py diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt new file mode 100644 index 0000000..4402ce8 --- /dev/null +++ b/examples/colbert/requirements.txt @@ -0,0 +1,3 @@ +colbert-ai +pgvector +psycopg[binary] diff --git a/examples/topic_modeling.py b/examples/gensim/example.py similarity index 100% rename from examples/topic_modeling.py rename to examples/gensim/example.py diff --git a/examples/gensim/requirements.txt b/examples/gensim/requirements.txt new file mode 100644 index 0000000..15411cd --- /dev/null +++ b/examples/gensim/requirements.txt @@ -0,0 +1,5 @@ +gensim +numpy +pgvector +psycopg[binary] +scipy<1.13 diff --git a/examples/hybrid_search.py b/examples/hybrid_search/cross_encoder.py similarity index 100% rename from examples/hybrid_search.py rename to examples/hybrid_search/cross_encoder.py diff --git a/examples/hybrid_search/requirements.txt b/examples/hybrid_search/requirements.txt new file mode 100644 index 0000000..237dcd1 --- /dev/null +++ b/examples/hybrid_search/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +sentence-transformers diff --git a/examples/hybrid_search_rrf.py 
b/examples/hybrid_search/rrf.py similarity index 100% rename from examples/hybrid_search_rrf.py rename to examples/hybrid_search/rrf.py diff --git a/examples/pytorch_image_search.py b/examples/image_search/example.py similarity index 100% rename from examples/pytorch_image_search.py rename to examples/image_search/example.py diff --git a/examples/image_search/requirements.txt b/examples/image_search/requirements.txt new file mode 100644 index 0000000..3d82365 --- /dev/null +++ b/examples/image_search/requirements.txt @@ -0,0 +1,6 @@ +matplotlib +pgvector +psycopg[binary] +torch +torchvision +tqdm diff --git a/examples/hash_image_search.py b/examples/imagehash/example.py similarity index 100% rename from examples/hash_image_search.py rename to examples/imagehash/example.py index 33fef0e..f49af40 100644 --- a/examples/hash_image_search.py +++ b/examples/imagehash/example.py @@ -1,8 +1,8 @@ from datasets import load_dataset +from imagehash import phash import matplotlib.pyplot as plt from pgvector.psycopg import register_vector, Bit import psycopg -from imagehash import phash def hash_image(img): diff --git a/examples/imagehash/requirements.txt b/examples/imagehash/requirements.txt new file mode 100644 index 0000000..e3971e6 --- /dev/null +++ b/examples/imagehash/requirements.txt @@ -0,0 +1,5 @@ +datasets +imagehash +matplotlib +pgvector +psycopg[binary] diff --git a/examples/implicit_recs.py b/examples/implicit/example.py similarity index 100% rename from examples/implicit_recs.py rename to examples/implicit/example.py diff --git a/examples/implicit/requirements.txt b/examples/implicit/requirements.txt new file mode 100644 index 0000000..8f04b58 --- /dev/null +++ b/examples/implicit/requirements.txt @@ -0,0 +1,4 @@ +implicit +pgvector +psycopg[binary] +SQLAlchemy diff --git a/examples/lightfm_recs.py b/examples/lightfm/example.py similarity index 100% rename from examples/lightfm_recs.py rename to examples/lightfm/example.py diff --git 
a/examples/lightfm/requirements.txt b/examples/lightfm/requirements.txt new file mode 100644 index 0000000..cfa5f51 --- /dev/null +++ b/examples/lightfm/requirements.txt @@ -0,0 +1,4 @@ +lightfm +pgvector +psycopg[binary] +SQLAlchemy diff --git a/examples/bulk_loading.py b/examples/loading/example.py similarity index 100% rename from examples/bulk_loading.py rename to examples/loading/example.py diff --git a/examples/loading/requirements.txt b/examples/loading/requirements.txt new file mode 100644 index 0000000..1cf8ee9 --- /dev/null +++ b/examples/loading/requirements.txt @@ -0,0 +1,3 @@ +numpy +pgvector +psycopg[binary] diff --git a/examples/openai_embeddings.py b/examples/openai/example.py similarity index 100% rename from examples/openai_embeddings.py rename to examples/openai/example.py diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt new file mode 100644 index 0000000..18587e2 --- /dev/null +++ b/examples/openai/requirements.txt @@ -0,0 +1,3 @@ +openai +pgvector +psycopg[binary] diff --git a/examples/morgan_fingerprints.py b/examples/rdkit/example.py similarity index 100% rename from examples/morgan_fingerprints.py rename to examples/rdkit/example.py diff --git a/examples/rdkit/requirements.txt b/examples/rdkit/requirements.txt new file mode 100644 index 0000000..85a3e4f --- /dev/null +++ b/examples/rdkit/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +rdkit diff --git a/examples/requirements.txt b/examples/requirements.txt deleted file mode 100644 index 45d8f17..0000000 --- a/examples/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -cohere -colbert-ai -datasets -gensim -imagehash -implicit -lightfm -matplotlib -openai -torch -torchvision -scikit-surprise -sentence-transformers diff --git a/examples/sentence_embeddings.py b/examples/sentence_transformers/example.py similarity index 100% rename from examples/sentence_embeddings.py rename to examples/sentence_transformers/example.py diff --git 
a/examples/sentence_transformers/requirements.txt b/examples/sentence_transformers/requirements.txt new file mode 100644 index 0000000..237dcd1 --- /dev/null +++ b/examples/sentence_transformers/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +sentence-transformers diff --git a/examples/sparse_search.py b/examples/sparse_search/example.py similarity index 100% rename from examples/sparse_search.py rename to examples/sparse_search/example.py diff --git a/examples/sparse_search/requirements.txt b/examples/sparse_search/requirements.txt new file mode 100644 index 0000000..3de81c7 --- /dev/null +++ b/examples/sparse_search/requirements.txt @@ -0,0 +1,5 @@ +numpy +pgvector +psycopg[binary] +torch +transformers diff --git a/examples/surprise_recs.py b/examples/surprise/example.py similarity index 100% rename from examples/surprise_recs.py rename to examples/surprise/example.py diff --git a/examples/surprise/requirements.txt b/examples/surprise/requirements.txt new file mode 100644 index 0000000..cb2dca4 --- /dev/null +++ b/examples/surprise/requirements.txt @@ -0,0 +1,4 @@ +pgvector +psycopg[binary] +scikit-surprise +SQLAlchemy From c431eb658cb24a0545b5d106d83c3c8029b981fb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 9 Sep 2024 16:37:31 -0700 Subject: [PATCH 256/424] Version bump to 0.3.3 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a856c8..0bace8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.3 (unreleased) +## 0.3.3 (2024-09-09) - Improved support for cursor factories with Psycopg 2 diff --git a/setup.py b/setup.py index db3df5c..ae636c6 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.3.2', + version='0.3.3', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From 3461181c17bb5814fbe80d9cd1f486ed543810e0 Mon Sep 
17 00:00:00 2001 From: Andrew Kane Date: Fri, 20 Sep 2024 19:20:00 -0700 Subject: [PATCH 257/424] Added schema option for asyncpg - closes #90 --- CHANGELOG.md | 4 ++++ pgvector/asyncpg/register.py | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bace8e..bff6664 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.4 (unreleased) + +- Added `schema` option for asyncpg + ## 0.3.3 (2024-09-09) - Improved support for cursor factories with Psycopg 2 diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py index ad75c2d..a388058 100644 --- a/pgvector/asyncpg/register.py +++ b/pgvector/asyncpg/register.py @@ -1,9 +1,10 @@ from ..utils import Vector, HalfVector, SparseVector -async def register_vector(conn): +async def register_vector(conn, schema='public'): await conn.set_type_codec( 'vector', + schema=schema, encoder=Vector._to_db_binary, decoder=Vector._from_db_binary, format='binary' @@ -12,6 +13,7 @@ async def register_vector(conn): try: await conn.set_type_codec( 'halfvec', + schema=schema, encoder=HalfVector._to_db_binary, decoder=HalfVector._from_db_binary, format='binary' @@ -19,6 +21,7 @@ async def register_vector(conn): await conn.set_type_codec( 'sparsevec', + schema=schema, encoder=SparseVector._to_db_binary, decoder=SparseVector._from_db_binary, format='binary' From e3f7bf029b6549bf10482a6dd511f5ecb5b33a01 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 22 Sep 2024 10:51:53 -0700 Subject: [PATCH 258/424] Updated example [skip ci] --- examples/loading/example.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/loading/example.py b/examples/loading/example.py index ba058c4..0702129 100644 --- a/examples/loading/example.py +++ b/examples/loading/example.py @@ -31,10 +31,6 @@ copy.write_row([embedding]) - # flush data - while conn.pgconn.flush() == 1: - pass - print('\nSuccess!') # create any indexes *after* loading initial data (skipping for this example) 
From 6b648ee8a3d6ebfb8013024adfe59231881ce9b1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 26 Sep 2024 20:00:38 -0700 Subject: [PATCH 259/424] Version bump to 0.3.4 [skip ci] --- CHANGELOG.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bff6664..51ea394 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.4 (unreleased) +## 0.3.4 (2024-09-26) - Added `schema` option for asyncpg diff --git a/setup.py b/setup.py index ae636c6..0cebd22 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='pgvector', - version='0.3.3', + version='0.3.4', description='pgvector support for Python', long_description=long_description, long_description_content_type='text/markdown', From f0db3a412797c88246077a1a66b35b5fdecfe0da Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 1 Oct 2024 20:30:58 -0700 Subject: [PATCH 260/424] Switched to pyproject.toml --- Makefile | 2 +- pyproject.toml | 23 +++++++++++++++++++++++ pytest.ini | 2 -- setup.py | 30 ------------------------------ 4 files changed, 24 insertions(+), 33 deletions(-) create mode 100644 pyproject.toml delete mode 100644 pytest.ini delete mode 100644 setup.py diff --git a/Makefile b/Makefile index c64f942..e790e44 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ lint: pycodestyle . 
--ignore=E501 publish: clean - python3 setup.py bdist_wheel --universal + python3 -m build ls dist twine upload dist/* make clean diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..226c0a4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "pgvector" +version = "0.3.4" +description = "pgvector support for Python" +readme = "README.md" +authors = [ + {name = "Andrew Kane", email = "andrew@ankane.org"} +] +license = {text = "MIT"} +requires-python = ">= 3.8" +dependencies = [ + "numpy" +] + +[project.urls] +Homepage = "https://github.com/pgvector/pgvector-python" + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 2f4c80e..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -asyncio_mode = auto diff --git a/setup.py b/setup.py deleted file mode 100644 index 0cebd22..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from setuptools import setup - -with open('README.md', 'r', encoding='utf-8') as fh: - long_description = fh.read() - -setup( - name='pgvector', - version='0.3.4', - description='pgvector support for Python', - long_description=long_description, - long_description_content_type='text/markdown', - url='https://github.com/pgvector/pgvector-python', - author='Andrew Kane', - author_email='andrew@ankane.org', - license='MIT', - packages=[ - 'pgvector.asyncpg', - 'pgvector.django', - 'pgvector.peewee', - 'pgvector.psycopg', - 'pgvector.psycopg2', - 'pgvector.sqlalchemy', - 'pgvector.utils' - ], - python_requires='>=3.8', - install_requires=[ - 'numpy' - ], - zip_safe=False -) From 4483f03047d9c4095a0de75d16c402fad5e00dfc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 1 Oct 2024 20:33:50 -0700 Subject: [PATCH 261/424] Removed top-level __init__ to be consistent with previous packaging --- pgvector/__init__.py | 0 1 file changed, 0 
insertions(+), 0 deletions(-) delete mode 100644 pgvector/__init__.py diff --git a/pgvector/__init__.py b/pgvector/__init__.py deleted file mode 100644 index e69de29..0000000 From e6e43f563b4e9361d3afa553ef1f8169941fdaa7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 00:17:17 -0700 Subject: [PATCH 262/424] Improved Makefile [skip ci] --- Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index e790e44..f0831c4 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ +.PHONY: lint build publish clean + lint: pycodestyle . --ignore=E501 -publish: clean +build: python3 -m build - ls dist + +publish: clean build twine upload dist/* - make clean clean: - rm -rf .pytest_cache build dist pgvector.egg-info + rm -rf .pytest_cache dist pgvector.egg-info From 9018d36200cc04ba931766acb068623cd8cb77d8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 10:16:51 -0700 Subject: [PATCH 263/424] Fixed deprecation warning with tests [skip ci] --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 226c0a4..25fcacc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,3 +21,4 @@ Homepage = "https://github.com/pgvector/pgvector-python" [tool.pytest.ini_options] asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" From b88ebed0089bed592ef8759c24c1b18bc772e5d2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 10:42:50 -0700 Subject: [PATCH 264/424] Added globally option for Psycopg 2 --- CHANGELOG.md | 4 ++++ pgvector/psycopg2/halfvec.py | 4 ++-- pgvector/psycopg2/register.py | 10 ++++++---- pgvector/psycopg2/sparsevec.py | 4 ++-- pgvector/psycopg2/vector.py | 4 ++-- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51ea394..e6fd85c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.5 (unreleased) + +- Added `globally` option for Psycopg 2 + ## 0.3.4 
(2024-09-26) - Added `schema` option for asyncpg diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index e3c0cdf..0fd66c1 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -14,7 +14,7 @@ def cast_halfvec(value, cur): return HalfVector._from_db(value) -def register_halfvec_info(oid): +def register_halfvec_info(oid, scope): halfvec = new_type((oid,), 'HALFVEC', cast_halfvec) - register_type(halfvec) + register_type(halfvec, scope) register_adapter(HalfVector, HalfvecAdapter) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 60a7aa7..ae457f8 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,9 +5,11 @@ from .vector import register_vector_info -def register_vector(conn_or_curs=None): +# TODO make globally False by default in 0.4.0 +def register_vector(conn_or_curs=None, globally=True): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) + scope = None if globally else conn_or_curs # use to_regtype to get first matching type in search path cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") @@ -16,10 +18,10 @@ def register_vector(conn_or_curs=None): if 'vector' not in type_info: raise psycopg2.ProgrammingError('vector type not found in the database') - register_vector_info(type_info['vector']) + register_vector_info(type_info['vector'], scope) if 'halfvec' in type_info: - register_halfvec_info(type_info['halfvec']) + register_halfvec_info(type_info['halfvec'], scope) if 'sparsevec' in type_info: - register_sparsevec_info(type_info['sparsevec']) + register_sparsevec_info(type_info['sparsevec'], scope) diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index 7cdf38c..03c7f7c 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -14,7 +14,7 @@ def 
cast_sparsevec(value, cur): return SparseVector._from_db(value) -def register_sparsevec_info(oid): +def register_sparsevec_info(oid, scope): sparsevec = new_type((oid,), 'SPARSEVEC', cast_sparsevec) - register_type(sparsevec) + register_type(sparsevec, scope) register_adapter(SparseVector, SparsevecAdapter) diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index c895f86..28c2f00 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -15,7 +15,7 @@ def cast_vector(value, cur): return Vector._from_db(value) -def register_vector_info(oid): +def register_vector_info(oid, scope): vector = new_type((oid,), 'VECTOR', cast_vector) - register_type(vector) + register_type(vector, scope) register_adapter(np.ndarray, VectorAdapter) From 70ae49a5e2e7e6c393f3b37041049091cf3007d5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 10:48:09 -0700 Subject: [PATCH 265/424] Updated tests to use globally=False --- tests/test_peewee.py | 12 ++++++++---- tests/test_psycopg2.py | 6 +++--- tests/test_sqlalchemy.py | 6 ++++-- tests/test_sqlmodel.py | 6 ++++-- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 0882890..72502db 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -169,7 +169,8 @@ def test_vector_avg(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.embedding)).scalar() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_vector_sum(self): sum = Item.select(fn.sum(Item.embedding)).scalar() @@ -177,7 +178,8 @@ def test_vector_sum(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.embedding)).scalar() - assert np.array_equal(sum, np.array([5, 7, 9])) + # does not type cast + assert sum == '[5,7,9]' def test_halfvec_avg(self): avg = Item.select(fn.avg(Item.half_embedding)).scalar() 
@@ -185,7 +187,8 @@ def test_halfvec_avg(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.half_embedding)).scalar() - assert avg.to_list() == [2.5, 3.5, 4.5] + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_halfvec_sum(self): sum = Item.select(fn.sum(Item.half_embedding)).scalar() @@ -193,7 +196,8 @@ def test_halfvec_sum(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.half_embedding)).scalar() - assert sum.to_list() == [5, 7, 9] + # does not type cast + assert sum == '[5,7,9]' def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 24e99d6..9e5100c 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -11,7 +11,7 @@ cur.execute('DROP TABLE IF EXISTS psycopg2_items') cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') -register_vector(cur) +register_vector(cur, globally=False) class TestPsycopg2: @@ -59,11 +59,11 @@ def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test') cur = conn.cursor(cursor_factory=cursor_factory) - register_vector(cur) + register_vector(cur, globally=False) conn.close() def test_cursor_factory_connection(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) - register_vector(conn) + register_vector(conn, globally=False) conn.close() diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index edce3dc..d7d0756 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -337,7 +337,8 @@ def test_avg(self): session.add(Item(embedding=[1, 2, 3])) 
session.add(Item(embedding=[4, 5, 6])) avg = session.query(func.avg(Item.embedding)).first()[0] - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_avg_orm(self): with Session(engine) as session: @@ -346,7 +347,8 @@ def test_avg_orm(self): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) avg = session.scalars(select(func.avg(Item.embedding))).first() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_sum(self): with Session(engine) as session: diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 5685ce6..e1716e1 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -203,7 +203,8 @@ def test_vector_avg(self): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) avg = session.exec(select(func.avg(Item.embedding))).first() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_vector_sum(self): with Session(engine) as session: @@ -221,7 +222,8 @@ def test_halfvec_avg(self): session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) avg = session.exec(select(func.avg(Item.half_embedding))).first() - assert avg.to_list() == [2.5, 3.5, 4.5] + # does not type cast + assert avg == '[2.5,3.5,4.5]' def test_halfvec_sum(self): with Session(engine) as session: From d0aa5d3769ae05babdb03c74d6aabc03f7f62330 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 11:32:10 -0700 Subject: [PATCH 266/424] Fixed indentation [skip ci] --- tests/test_sqlmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index e1716e1..1211d8c 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -203,7 +203,7 @@ def test_vector_avg(self): session.add(Item(embedding=[1, 2, 3])) 
session.add(Item(embedding=[4, 5, 6])) avg = session.exec(select(func.avg(Item.embedding))).first() - # does not type cast + # does not type cast assert avg == '[2.5,3.5,4.5]' def test_vector_sum(self): From 8879d95c1a75434cc413c238dec550e3c8167ec4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 11:46:37 -0700 Subject: [PATCH 267/424] Added note about register_adapter [skip ci] --- pgvector/psycopg2/register.py | 1 + tests/test_sqlalchemy.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index ae457f8..95d03a4 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -6,6 +6,7 @@ # TODO make globally False by default in 0.4.0 +# note: register_adapter is always global def register_vector(conn_or_curs=None, globally=True): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index d7d0756..9419ee5 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -403,8 +403,9 @@ def test_insert(self): def test_insert_bulk(self): session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) - def test_insert_text(self): - session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + # register_vector in psycopg2 tests change this behavior + # def test_insert_text(self): + # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) def test_automap(self): metadata = MetaData() From 89ec21dac00fb4e4836931e8eead4311d4bf5114 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 11:47:39 -0700 Subject: [PATCH 268/424] Updated tests to support SQLAlchemy 1 --- tests/test_sqlalchemy.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git 
a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9419ee5..77f5ab3 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -4,10 +4,17 @@ from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError from sqlalchemy.ext.automap import automap_base -from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine -from sqlalchemy.orm import declarative_base, mapped_column, Session +from sqlalchemy.orm import declarative_base, Session from sqlalchemy.sql import func +try: + from sqlalchemy.orm import mapped_column + from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + sqlalchemy_version = 2 +except ImportError: + mapped_column = Column + sqlalchemy_version = 1 + engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') with Session(engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) @@ -418,6 +425,7 @@ def test_automap(self): assert item.embedding.tolist() == [1, 2, 3] @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_async(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From f65c3614c07184f73e0af854e92f89a928f8c3e6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 12:40:50 -0700 Subject: [PATCH 269/424] Fixed type casting for Peewee aggregations [skip ci] --- README.md | 2 +- tests/test_peewee.py | 28 ++++++++++++---------------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f68fda4..4ed1b7a 100644 --- a/README.md +++ b/README.md @@ -496,7 +496,7 @@ Average vectors ```python from peewee import fn -Item.select(fn.avg(Item.embedding)).scalar() +Item.select(fn.avg(Item.embedding).coerce(True)).scalar() ``` Also supports `sum` diff --git 
a/tests/test_peewee.py b/tests/test_peewee.py index 72502db..8785b60 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -164,40 +164,36 @@ def test_where(self): assert [v.id for v in items] == [1] def test_vector_avg(self): - avg = Item.select(fn.avg(Item.embedding)).scalar() + avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() assert avg is None Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) - avg = Item.select(fn.avg(Item.embedding)).scalar() - # does not type cast - assert avg == '[2.5,3.5,4.5]' + avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() + assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) def test_vector_sum(self): - sum = Item.select(fn.sum(Item.embedding)).scalar() + sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() assert sum is None Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) - sum = Item.select(fn.sum(Item.embedding)).scalar() - # does not type cast - assert sum == '[5,7,9]' + sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() + assert np.array_equal(sum, np.array([5, 7, 9])) def test_halfvec_avg(self): - avg = Item.select(fn.avg(Item.half_embedding)).scalar() + avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() assert avg is None Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) - avg = Item.select(fn.avg(Item.half_embedding)).scalar() - # does not type cast - assert avg == '[2.5,3.5,4.5]' + avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() + assert avg.to_list() == [2.5, 3.5, 4.5] def test_halfvec_sum(self): - sum = Item.select(fn.sum(Item.half_embedding)).scalar() + sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() assert sum is None Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) - sum = Item.select(fn.sum(Item.half_embedding)).scalar() - # does not type cast - assert sum == '[5,7,9]' + sum = 
Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() + assert sum.to_list() == [5, 7, 9] def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) From e90260ae9a04ec776eba435b9bbc21cec008b570 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 13:00:15 -0700 Subject: [PATCH 270/424] Added avg function with type casting to SQLAlchemy - #44 Co-authored-by: lucasgadams --- CHANGELOG.md | 1 + pgvector/sqlalchemy/__init__.py | 5 ++++- pgvector/sqlalchemy/functions.py | 8 +++++++ tests/test_sqlalchemy.py | 36 +++++++++++++++----------------- tests/test_sqlmodel.py | 36 +++++++++++++++----------------- 5 files changed, 47 insertions(+), 39 deletions(-) create mode 100644 pgvector/sqlalchemy/functions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e6fd85c..38b1830 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.3.5 (unreleased) +- Added `avg` function with type casting to SQLAlchemy - Added `globally` option for Psycopg 2 ## 0.3.4 (2024-09-26) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 67b1d16..382f53f 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,4 +1,5 @@ from .bit import BIT +from .functions import avg, sum from .halfvec import HALFVEC from .sparsevec import SPARSEVEC from .vector import VECTOR @@ -12,5 +13,7 @@ 'BIT', 'SPARSEVEC', 'HalfVector', - 'SparseVector' + 'SparseVector', + 'avg', + 'sum' ] diff --git a/pgvector/sqlalchemy/functions.py b/pgvector/sqlalchemy/functions.py new file mode 100644 index 0000000..8f7ecc8 --- /dev/null +++ b/pgvector/sqlalchemy/functions.py @@ -0,0 +1,8 @@ +# https://docs.sqlalchemy.org/en/20/core/functions.html +# include sum for a consistent API +from sqlalchemy.sql.functions import ReturnTypeFromArgs, sum + + +class avg(ReturnTypeFromArgs): + inherit_cache = True + package = 'pgvector' diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 77f5ab3..1c84108 
100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer from sqlalchemy.exc import StatementError @@ -339,41 +339,39 @@ def test_select_orm(self): def test_avg(self): with Session(engine) as session: - avg = session.query(func.avg(Item.embedding)).first()[0] - assert avg is None + res = session.query(avg(Item.embedding)).first()[0] + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = session.query(func.avg(Item.embedding)).first()[0] - # does not type cast - assert avg == '[2.5,3.5,4.5]' + res = session.query(avg(Item.embedding)).first()[0] + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) def test_avg_orm(self): with Session(engine) as session: - avg = session.scalars(select(func.avg(Item.embedding))).first() - assert avg is None + res = session.scalars(select(avg(Item.embedding))).first() + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = session.scalars(select(func.avg(Item.embedding))).first() - # does not type cast - assert avg == '[2.5,3.5,4.5]' + res = session.scalars(select(avg(Item.embedding))).first() + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) def test_sum(self): with Session(engine) as session: - sum = session.query(func.sum(Item.embedding)).first()[0] - assert sum is None + res = session.query(sum(Item.embedding)).first()[0] + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - sum = session.query(func.sum(Item.embedding)).first()[0] - assert np.array_equal(sum, np.array([5, 7, 9])) + res = session.query(sum(Item.embedding)).first()[0] + assert 
np.array_equal(res, np.array([5, 7, 9])) def test_sum_orm(self): with Session(engine) as session: - sum = session.scalars(select(func.sum(Item.embedding))).first() - assert sum is None + res = session.scalars(select(sum(Item.embedding))).first() + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - sum = session.scalars(select(func.sum(Item.embedding))).first() - assert np.array_equal(sum, np.array([5, 7, 9])) + res = session.scalars(select(sum(Item.embedding))).first() + assert np.array_equal(res, np.array([5, 7, 9])) def test_bad_dimensions(self): item = Item(embedding=[1, 2]) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 1211d8c..f6f26d6 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError @@ -198,41 +198,39 @@ def test_select(self): def test_vector_avg(self): with Session(engine) as session: - avg = session.exec(select(func.avg(Item.embedding))).first() - assert avg is None + res = session.exec(select(avg(Item.embedding))).first() + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = session.exec(select(func.avg(Item.embedding))).first() - # does not type cast - assert avg == '[2.5,3.5,4.5]' + res = session.exec(select(avg(Item.embedding))).first() + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) def test_vector_sum(self): with Session(engine) as session: - sum = session.exec(select(func.sum(Item.embedding))).first() - assert sum is None + res = session.exec(select(sum(Item.embedding))).first() + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - sum = 
session.exec(select(func.sum(Item.embedding))).first() - assert np.array_equal(sum, np.array([5, 7, 9])) + res = session.exec(select(sum(Item.embedding))).first() + assert np.array_equal(res, np.array([5, 7, 9])) def test_halfvec_avg(self): with Session(engine) as session: - avg = session.exec(select(func.avg(Item.half_embedding))).first() - assert avg is None + res = session.exec(select(avg(Item.half_embedding))).first() + assert res is None session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) - avg = session.exec(select(func.avg(Item.half_embedding))).first() - # does not type cast - assert avg == '[2.5,3.5,4.5]' + res = session.exec(select(avg(Item.half_embedding))).first() + assert res.to_list() == [2.5, 3.5, 4.5] def test_halfvec_sum(self): with Session(engine) as session: - sum = session.exec(select(func.sum(Item.half_embedding))).first() - assert sum is None + res = session.exec(select(sum(Item.half_embedding))).first() + assert res is None session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) - sum = session.exec(select(func.sum(Item.half_embedding))).first() - assert sum.to_list() == [5, 7, 9] + res = session.exec(select(sum(Item.half_embedding))).first() + assert res.to_list() == [5, 7, 9] def test_bad_dimensions(self): item = Item(embedding=[1, 2]) From 6bc66f828943c4d93e8c89f9b18ec5738b251d40 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 2 Oct 2024 13:12:25 -0700 Subject: [PATCH 271/424] Wait to add avg and sum to __all__ [skip ci] --- pgvector/sqlalchemy/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 382f53f..6377c2c 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -14,6 +14,7 @@ 'SPARSEVEC', 'HalfVector', 'SparseVector', - 'avg', - 'sum' + # TODO add avg and sum in 0.4.0 + # 'avg', + # 'sum' ] From 
107b81efd0144564483fa59ee3e7acdf814cb6f3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 5 Oct 2024 10:43:36 -0700 Subject: [PATCH 272/424] Removed unused code [skip ci] --- examples/sparse_search/example.py | 1 - pgvector/peewee/bit.py | 2 +- pgvector/peewee/halfvec.py | 2 +- pgvector/peewee/sparsevec.py | 2 +- pgvector/peewee/vector.py | 2 +- pgvector/psycopg/bit.py | 2 +- pgvector/psycopg/register.py | 1 - tests/test_asyncpg.py | 1 - tests/test_half_vector.py | 2 +- tests/test_sqlmodel.py | 1 - tests/test_vector.py | 2 +- 11 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/sparse_search/example.py b/examples/sparse_search/example.py index 6ce33e8..fa6074e 100644 --- a/examples/sparse_search/example.py +++ b/examples/sparse_search/example.py @@ -2,7 +2,6 @@ # https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/ # https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1 -import numpy as np from pgvector.psycopg import register_vector, SparseVector import psycopg import torch diff --git a/pgvector/peewee/bit.py b/pgvector/peewee/bit.py index 8a3bfbc..ee5f12f 100644 --- a/pgvector/peewee/bit.py +++ b/pgvector/peewee/bit.py @@ -1,4 +1,4 @@ -from peewee import Expression, Field, Value +from peewee import Expression, Field class FixedBitField(Field): diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index bed7d1f..deaa14d 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -1,4 +1,4 @@ -from peewee import Expression, Field, Value +from peewee import Expression, Field from ..utils import HalfVector diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index 8bba5cf..67f7d1b 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -1,4 +1,4 @@ -from peewee import Expression, Field, Value +from peewee import Expression, Field from ..utils import SparseVector diff --git a/pgvector/peewee/vector.py 
b/pgvector/peewee/vector.py index a9ebf87..22a87e5 100644 --- a/pgvector/peewee/vector.py +++ b/pgvector/peewee/vector.py @@ -1,4 +1,4 @@ -from peewee import Expression, Field, Value +from peewee import Expression, Field from ..utils import Vector diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py index 605c6f0..f8eeb61 100644 --- a/pgvector/psycopg/bit.py +++ b/pgvector/psycopg/bit.py @@ -1,4 +1,4 @@ -from psycopg.adapt import Loader, Dumper +from psycopg.adapt import Dumper from psycopg.pq import Format from ..utils import Bit diff --git a/pgvector/psycopg/register.py b/pgvector/psycopg/register.py index 7f54a31..b93fd3e 100644 --- a/pgvector/psycopg/register.py +++ b/pgvector/psycopg/register.py @@ -1,4 +1,3 @@ -import psycopg from psycopg.types import TypeInfo from .bit import register_bit_info from .halfvec import register_halfvec_info diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 829883e..e1b6250 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,4 +1,3 @@ -import asyncio import asyncpg import numpy as np from pgvector.asyncpg import register_vector, SparseVector diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 2a0d3a3..fdaa5f7 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -8,7 +8,7 @@ def test_list(self): assert HalfVector([1, 2, 3]).to_list() == [1, 2, 3] def test_list_str(self): - with pytest.raises(ValueError, match='could not convert string to float') as error: + with pytest.raises(ValueError, match='could not convert string to float'): HalfVector([1, 'two', 3]) def test_tuple(self): diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index f6f26d6..4cb0e9b 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -3,7 +3,6 @@ import pytest from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError -from sqlalchemy.sql import func from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text from 
typing import Any, Optional diff --git a/tests/test_vector.py b/tests/test_vector.py index 90d6a9b..1be2bc0 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -8,7 +8,7 @@ def test_list(self): assert Vector([1, 2, 3]).to_list() == [1, 2, 3] def test_list_str(self): - with pytest.raises(ValueError, match='could not convert string to float') as error: + with pytest.raises(ValueError, match='could not convert string to float'): Vector([1, 'two', 3]) def test_tuple(self): From 75a5d50a253c0cafbff8f6cb8d6ad9ad3b6c015b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 5 Oct 2024 11:03:42 -0700 Subject: [PATCH 273/424] Improved code [skip ci] --- pgvector/asyncpg/__init__.py | 1 + pgvector/sqlalchemy/__init__.py | 7 +++---- pgvector/sqlalchemy/functions.py | 6 ++++++ pgvector/utils/__init__.py | 15 +++++++++++---- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index f4e8754..543b882 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -3,6 +3,7 @@ __all__ = [ 'register_vector', + 'Vector', 'HalfVector', 'SparseVector' ] diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 6377c2c..4955eeb 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -8,13 +8,12 @@ __all__ = [ 'Vector', - 'VECTOR' + 'VECTOR', 'HALFVEC', 'BIT', 'SPARSEVEC', 'HalfVector', 'SparseVector', - # TODO add avg and sum in 0.4.0 - # 'avg', - # 'sum' + 'avg', + 'sum' ] diff --git a/pgvector/sqlalchemy/functions.py b/pgvector/sqlalchemy/functions.py index 8f7ecc8..72e3ca7 100644 --- a/pgvector/sqlalchemy/functions.py +++ b/pgvector/sqlalchemy/functions.py @@ -6,3 +6,9 @@ class avg(ReturnTypeFromArgs): inherit_cache = True package = 'pgvector' + + +__all__ = [ + 'avg', + 'sum' +] diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 69e8cfb..3c01160 100644 --- a/pgvector/utils/__init__.py +++ 
b/pgvector/utils/__init__.py @@ -1,4 +1,11 @@ -from .bit import * -from .halfvec import * -from .sparsevec import * -from .vector import * +from .bit import Bit +from .halfvec import HalfVector +from .sparsevec import SparseVector +from .vector import Vector + +__all__ = [ + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] From 43b809fa519e9822406d90ad2630463c1ecea245 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 5 Oct 2024 18:19:25 -0700 Subject: [PATCH 274/424] Version bump to 0.3.5 [skip ci] --- CHANGELOG.md | 2 +- README.md | 4 ++-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38b1830..b89f3a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.5 (unreleased) +## 0.3.5 (2024-10-05) - Added `avg` function with type casting to SQLAlchemy - Added `globally` option for Psycopg 2 diff --git a/README.md b/README.md index 4ed1b7a..613c387 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ session.scalars(select(Item).filter(Item.embedding.l2_distance([3, 1, 2]) < 5)) Average vectors ```python -from sqlalchemy.sql import func +from pgvector.sqlalchemy import avg session.scalars(select(func.avg(Item.embedding))).first() ``` @@ -265,7 +265,7 @@ session.exec(select(Item).filter(Item.embedding.l2_distance([3, 1, 2]) < 5)) Average vectors ```python -from sqlalchemy.sql import func +from pgvector.sqlalchemy import avg session.exec(select(func.avg(Item.embedding))).first() ``` diff --git a/pyproject.toml b/pyproject.toml index 25fcacc..cceeab0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.3.4" +version = "0.3.5" description = "pgvector support for Python" readme = "README.md" authors = [ From 0852a1f9d2a3ae0a4d90c891d73c2a466d3361d9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 15:15:36 -0700 Subject: [PATCH 275/424] Added test for arrays with 
SQLAlchemy - #96 --- tests/test_sqlalchemy.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 1c84108..5f0c3ef 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest -from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer +from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError from sqlalchemy.ext.automap import automap_base from sqlalchemy.orm import declarative_base, Session @@ -31,6 +31,7 @@ class Item(Base): half_embedding = mapped_column(HALFVEC(3)) binary_embedding = mapped_column(BIT(3)) sparse_embedding = mapped_column(SPARSEVEC(3)) + embeddings = mapped_column(ARRAY(VECTOR(3))) Base.metadata.drop_all(engine) @@ -70,7 +71,8 @@ def test_core(self): Column('embedding', VECTOR(3)), Column('half_embedding', HALFVEC(3)), Column('binary_embedding', BIT(3)), - Column('sparse_embedding', SPARSEVEC(3)) + Column('sparse_embedding', SPARSEVEC(3)), + Column('embeddings', ARRAY(VECTOR(3))) ) metadata.drop_all(engine) @@ -422,6 +424,14 @@ def test_automap(self): item = session.query(AutoItem).first() assert item.embedding.tolist() == [1, 2, 3] + def test_vector_array(self): + session = Session(engine) + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() + + # this fails if the driver does not cast arrays + # item = session.get(Item, 1) + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_async(self): From 82b7ab9922a9499c114035b7b8e0e8fc1c12a0a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 15:35:21 -0700 Subject: [PATCH 276/424] Added arrays option 
for Psycopg 2 --- CHANGELOG.md | 4 ++++ pgvector/psycopg2/halfvec.py | 9 +++++++-- pgvector/psycopg2/register.py | 11 ++++++----- pgvector/psycopg2/sparsevec.py | 9 +++++++-- pgvector/psycopg2/vector.py | 9 +++++++-- tests/test_psycopg2.py | 33 ++++++++++++++++++++++++++++++--- 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b89f3a0..efaa85c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.6 (unreleased) + +- Added `arrays` option for Psycopg 2 + ## 0.3.5 (2024-10-05) - Added `avg` function with type casting to SQLAlchemy diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index 0fd66c1..b50e89b 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -1,4 +1,4 @@ -from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type from ..utils import HalfVector @@ -14,7 +14,12 @@ def cast_halfvec(value, cur): return HalfVector._from_db(value) -def register_halfvec_info(oid, scope): +def register_halfvec_info(oid, array_oid, scope): halfvec = new_type((oid,), 'HALFVEC', cast_halfvec) register_type(halfvec, scope) + + if array_oid is not None: + halfvecarray = new_array_type((array_oid,), 'HALFVECARRAY', halfvec) + register_type(halfvecarray, scope) + register_adapter(HalfVector, HalfvecAdapter) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 95d03a4..7752852 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -7,22 +7,23 @@ # TODO make globally False by default in 0.4.0 # note: register_adapter is always global -def register_vector(conn_or_curs=None, globally=True): +# TODO make arrays True by default in 0.4.0 +def register_vector(conn_or_curs=None, globally=True, arrays=False): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur =
conn.cursor(cursor_factory=cursor) scope = None if globally else conn_or_curs # use to_regtype to get first matching type in search path - cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") + cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('_vector'), to_regtype('halfvec'), to_regtype('_halfvec'), to_regtype('sparsevec'), to_regtype('_sparsevec'))") type_info = dict(cur.fetchall()) if 'vector' not in type_info: raise psycopg2.ProgrammingError('vector type not found in the database') - register_vector_info(type_info['vector'], scope) + register_vector_info(type_info['vector'], type_info['_vector'] if arrays else None, scope) if 'halfvec' in type_info: - register_halfvec_info(type_info['halfvec'], scope) + register_halfvec_info(type_info['halfvec'], type_info['_halfvec'] if arrays else None, scope) if 'sparsevec' in type_info: - register_sparsevec_info(type_info['sparsevec'], scope) + register_sparsevec_info(type_info['sparsevec'], type_info['_sparsevec'] if arrays else None, scope) diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index 03c7f7c..a542807 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -1,4 +1,4 @@ -from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type from ..utils import SparseVector @@ -14,7 +14,12 @@ def cast_sparsevec(value, cur): return SparseVector._from_db(value) -def register_sparsevec_info(oid, scope): +def register_sparsevec_info(oid, array_oid, scope): sparsevec = new_type((oid,), 'SPARSEVEC', cast_sparsevec) register_type(sparsevec, scope) + + if array_oid is not None: + sparsevecarray = new_array_type((array_oid,), 'SPARSEVECARRAY', sparsevec) + register_type(sparsevecarray, scope) + register_adapter(SparseVector, 
SparsevecAdapter) diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 28c2f00..9861f01 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -1,5 +1,5 @@ import numpy as np -from psycopg2.extensions import adapt, new_type, register_adapter, register_type +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type from ..utils import Vector @@ -15,7 +15,12 @@ def cast_vector(value, cur): return Vector._from_db(value) -def register_vector_info(oid, scope): +def register_vector_info(oid, array_oid, scope): vector = new_type((oid,), 'VECTOR', cast_vector) register_type(vector, scope) + + if array_oid is not None: + vectorarray = new_array_type((array_oid,), 'VECTORARRAY', vector) + register_type(vectorarray, scope) + register_adapter(np.ndarray, VectorAdapter) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 9e5100c..c93fce4 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg2 import register_vector, SparseVector +from pgvector.psycopg2 import register_vector, HalfVector, SparseVector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor @@ -9,9 +9,9 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') cur.execute('DROP TABLE IF EXISTS psycopg2_items') -cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') +cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') -register_vector(cur, globally=False) +register_vector(cur, globally=False, arrays=True) class TestPsycopg2: @@ -55,6 +55,33 @@ def test_sparsevec(self): assert 
res[0][0].to_list() == [1.5, 2, 3] assert res[1][0] is None + def test_vector_array(self): + embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (embeddings) VALUES (%s::vector[])', (embeddings,)) + + cur.execute('SELECT embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert np.array_equal(res[0][0], embeddings[0]) + assert np.array_equal(res[0][1], embeddings[1]) + + def test_halfvec_array(self): + embeddings = [HalfVector([1.5, 2, 3]), HalfVector([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (half_embeddings) VALUES (%s::halfvec[])', (embeddings,)) + + cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][1].to_list() == [4.5, 5, 6] + + def test_sparsevec_array(self): + embeddings = [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (sparse_embeddings) VALUES (%s::sparsevec[])', (embeddings,)) + + cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][1].to_list() == [4.5, 5, 6] + def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test') From 7887a34d5f1914592fcbd52be13b18b7ed43d39e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 15:51:16 -0700 Subject: [PATCH 277/424] Added test for arrays with asyncpg --- tests/test_asyncpg.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index e1b6250..7a68a9e 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -94,6 +94,24 @@ async def test_sparsevec(self): await conn.close() + @pytest.mark.asyncio + async def test_vector_array(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await 
conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embeddings vector[])') + + await register_vector(conn) + + embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings[0], embeddings[1]) + + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert np.array_equal(res[0]['embeddings'][0], embeddings[0]) + assert np.array_equal(res[0]['embeddings'][1], embeddings[1]) + + await conn.close() + @pytest.mark.asyncio async def test_pool(self): async def init(conn): From ba393fe2711142a49a8654304f9fb1051b1e2570 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 15:59:01 -0700 Subject: [PATCH 278/424] Added test for arrays with Django --- tests/test_django.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_django.py b/tests/test_django.py index 06d86a4..6990141 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -1,5 +1,6 @@ import django from django.conf import settings +from django.contrib.postgres.fields import ArrayField from django.core import serializers from django.db import connection, migrations, models from django.db.models import Avg, Sum @@ -46,6 +47,7 @@ class Item(models.Model): half_embedding = HalfVectorField(dimensions=3, null=True, blank=True) binary_embedding = BitField(length=3, null=True, blank=True) sparse_embedding = SparseVectorField(dimensions=3, null=True, blank=True) + embeddings = ArrayField(VectorField(dimensions=3), null=True, blank=True) class Meta: app_label = 'django_app' @@ -82,6 +84,7 @@ class Migration(migrations.Migration): ('half_embedding', pgvector.django.HalfVectorField(dimensions=3, null=True, blank=True)), ('binary_embedding', pgvector.django.BitField(length=3, null=True, blank=True)), ('sparse_embedding', 
pgvector.django.SparseVectorField(dimensions=3, null=True, blank=True)), + ('embeddings', ArrayField(pgvector.django.VectorField(dimensions=3), null=True, blank=True)), ], ), migrations.AddIndex( @@ -433,3 +436,9 @@ def test_missing(self): assert Item.objects.first().half_embedding is None assert Item.objects.first().binary_embedding is None assert Item.objects.first().sparse_embedding is None + + def test_vector_array(self): + Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]).save() + + # this fails if the driver does not cast arrays + # item = Item.objects.get(pk=1) From c6c7d765fb84eaa29e7d7fd974d8aeaaa0734871 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 16:18:30 -0700 Subject: [PATCH 279/424] Added test for arrays with Peewee [skip ci] --- tests/test_peewee.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 8785b60..9666388 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -199,3 +199,25 @@ def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) Item.get_or_create(embedding=np.array([4, 5, 6])) Item.get_or_create(embedding=Item.embedding.to_value([7, 8, 9])) + + def test_vector_array(self): + from playhouse.postgres_ext import PostgresqlExtDatabase, ArrayField + + ext_db = PostgresqlExtDatabase('pgvector_python_test') + + class ExtItem(BaseModel): + embeddings = ArrayField(VectorField, field_kwargs={'dimensions': 3}, index=False) + + class Meta: + database = ext_db + table_name = 'peewee_ext_item' + + ext_db.connect() + ext_db.drop_tables([ExtItem]) + ext_db.create_tables([ExtItem]) + + # fails with column "embeddings" is of type vector[] but expression is of type text[] + # ExtItem.create(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]) + # item = ExtItem.get_by_id(1) + # assert np.array_equal(item.embeddings[0], np.array([1, 2, 3])) + # assert np.array_equal(item.embeddings[1], 
np.array([4, 5, 6])) From 2ff631e1e947954313d7d35426651c4788115057 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 16:25:51 -0700 Subject: [PATCH 280/424] Improved test for arrays with SQLAlchemy [skip ci] --- tests/test_sqlalchemy.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 5f0c3ef..e783ac0 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -425,12 +425,19 @@ def test_automap(self): assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): + from pgvector.psycopg2 import register_vector + session = Session(engine) session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - # this fails if the driver does not cast arrays - # item = session.get(Item, 1) + with engine.connect() as connection: + register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) + + # this fails if the driver does not cast arrays + item = Session(bind=connection).get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') From 417f3ec7de0c8aaac7ce53ef8089ce7b0baa2483 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 16:26:52 -0700 Subject: [PATCH 281/424] Improved test [skip ci] --- tests/test_sqlalchemy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index e783ac0..1ca0ea3 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -425,13 +425,12 @@ def test_automap(self): assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): - from pgvector.psycopg2 import register_vector - session = Session(engine) session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() with 
engine.connect() as connection: + from pgvector.psycopg2 import register_vector register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) # this fails if the driver does not cast arrays From f2f30e1dd2691523a68ef347f847b38b44f5fa79 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Oct 2024 16:30:50 -0700 Subject: [PATCH 282/424] Improved test for arrays with Django [skip ci] --- tests/test_django.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 6990141..bbeee11 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -440,5 +440,11 @@ def test_missing(self): def test_vector_array(self): Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]).save() - # this fails if the driver does not cast arrays - # item = Item.objects.get(pk=1) + with connection.cursor() as cursor: + from pgvector.psycopg import register_vector + register_vector(cursor.connection) + + # this fails if the driver does not cast arrays + item = Item.objects.get(pk=1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] From 59a3efcbb00c1139a53fe0777e3400e15f4559f4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 16 Oct 2024 23:05:48 -0700 Subject: [PATCH 283/424] Updated readme [skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 613c387..acd625d 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ Average vectors ```python from pgvector.sqlalchemy import avg -session.scalars(select(func.avg(Item.embedding))).first() +session.scalars(select(avg(Item.embedding))).first() ``` Also supports `sum` @@ -267,7 +267,7 @@ Average vectors ```python from pgvector.sqlalchemy import avg -session.exec(select(func.avg(Item.embedding))).first() +session.exec(select(avg(Item.embedding))).first() ``` Also supports `sum` From 
79d41119dabd2211601807263cee874c3a043d9f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Oct 2024 13:21:05 -0700 Subject: [PATCH 284/424] Added tests for double and numeric arrays --- tests/test_django.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/test_django.py b/tests/test_django.py index bbeee11..5ab5f81 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -3,7 +3,8 @@ from django.contrib.postgres.fields import ArrayField from django.core import serializers from django.db import connection, migrations, models -from django.db.models import Avg, Sum +from django.db.models import Avg, Sum, FloatField, DecimalField +from django.db.models.functions import Cast from django.db.migrations.loader import MigrationLoader from django.forms import ModelForm from math import sqrt @@ -48,6 +49,8 @@ class Item(models.Model): binary_embedding = BitField(length=3, null=True, blank=True) sparse_embedding = SparseVectorField(dimensions=3, null=True, blank=True) embeddings = ArrayField(VectorField(dimensions=3), null=True, blank=True) + double_embedding = ArrayField(FloatField(), null=True, blank=True) + numeric_embedding = ArrayField(DecimalField(max_digits=20, decimal_places=10), null=True, blank=True) class Meta: app_label = 'django_app' @@ -85,6 +88,8 @@ class Migration(migrations.Migration): ('binary_embedding', pgvector.django.BitField(length=3, null=True, blank=True)), ('sparse_embedding', pgvector.django.SparseVectorField(dimensions=3, null=True, blank=True)), ('embeddings', ArrayField(pgvector.django.VectorField(dimensions=3), null=True, blank=True)), + ('double_embedding', ArrayField(FloatField(), null=True, blank=True)), + ('numeric_embedding', ArrayField(DecimalField(max_digits=20, decimal_places=10), null=True, blank=True)), ], ), migrations.AddIndex( @@ -448,3 +453,23 @@ def test_vector_array(self): item = Item.objects.get(pk=1) assert item.embeddings[0].tolist() == [1, 2, 3] assert 
item.embeddings[1].tolist() == [4, 5, 6] + + def test_double_array(self): + Item(id=1, double_embedding=[1, 1, 1]).save() + Item(id=2, double_embedding=[2, 2, 2]).save() + Item(id=3, double_embedding=[1, 1, 2]).save() + distance = L2Distance(Cast('double_embedding', VectorField()), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + assert items[1].double_embedding == [1, 1, 2] + + def test_numeric_array(self): + Item(id=1, numeric_embedding=[1, 1, 1]).save() + Item(id=2, numeric_embedding=[2, 2, 2]).save() + Item(id=3, numeric_embedding=[1, 1, 2]).save() + distance = L2Distance(Cast('numeric_embedding', VectorField()), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + assert items[1].numeric_embedding == [1, 1, 2] From d24839b48f78059b2732f8dfb398fc1ffc8070cc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 26 Oct 2024 17:13:38 -0700 Subject: [PATCH 285/424] Version bump to 0.3.6 [skip ci] --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index efaa85c..3a517d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.6 (unreleased) +## 0.3.6 (2024-10-26) - Added `arrays` option for Psycopg 2 diff --git a/pyproject.toml b/pyproject.toml index cceeab0..a6a6609 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.3.5" +version = "0.3.6" description = "pgvector support for Python" readme = "README.md" authors = [ From 1037d7e4c05948b6b5bfc6f8d43e0e7730224f63 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 28 Oct 2024 19:37:18 -0700 Subject: [PATCH 286/424] Added test for half-precision indexing with SQLAlchemy 
- #98 --- tests/test_sqlalchemy.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 1ca0ea3..8a032ef 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -46,6 +46,15 @@ class Item(Base): ) index.create(engine) +half_precision_index = Index( + 'sqlalchemy_orm_half_precision_index', + func.cast(Item.embedding, HALFVEC(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'halfvec_l2_ops'} +) +half_precision_index.create(engine) + def create_items(): session = Session(engine) @@ -438,6 +447,12 @@ def test_vector_array(self): assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] + def test_half_precision(self): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_async(self): From 32a8d04b06b0f3e77d639e9a9ed275a67fa1e36f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 28 Oct 2024 19:52:49 -0700 Subject: [PATCH 287/424] Added docs for half-precision indexing with SQLAlchemy - #98 [skip ci] --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index acd625d..bbf5973 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,29 @@ index.create(engine) Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +#### Half-Precision Indexing + +Index vectors at half-precision + +```python +from pgvector.sqlalchemy import HALFVEC +from sqlalchemy.sql import func + +index = Index( + 'my_index', + func.cast(Item.embedding, HALFVEC(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 
'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +``` + +Get the nearest neighbors + +```python +session.scalars(select(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])).limit(5)) +``` + ## SQLModel Enable the extension From 2c8fe09f824bc509ae692d2932fe1a0bc15b6923 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 28 Oct 2024 20:19:06 -0700 Subject: [PATCH 288/424] Fixed example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bbf5973..10cca79 100644 --- a/README.md +++ b/README.md @@ -227,7 +227,7 @@ index = Index( func.cast(Item.embedding, HALFVEC(3)).label('embedding'), postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, - postgresql_ops={'embedding': 'vector_l2_ops'} + postgresql_ops={'embedding': 'halfvec_l2_ops'} ) ``` From 3d9ff72a270bbfedc2d579db7f2d03b4048dfbc0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 28 Oct 2024 20:21:06 -0700 Subject: [PATCH 289/424] Improved example [skip ci] --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 10cca79..917d1a7 100644 --- a/README.md +++ b/README.md @@ -234,7 +234,8 @@ index = Index( Get the nearest neighbors ```python -session.scalars(select(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])).limit(5)) +order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2]) +session.scalars(select(Item).order_by(order).limit(5)) ``` ## SQLModel From 25a30264599c9646b743efdfe1d28b99d6208f90 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 3 Nov 2024 19:30:28 -0800 Subject: [PATCH 290/424] Updated test [skip ci] --- tests/test_django.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_django.py b/tests/test_django.py index 5ab5f81..92f3733 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -65,7 +65,7 @@ class Meta: name='hnsw_idx', 
fields=['embedding'], m=16, - ef_construction=100, + ef_construction=64, opclasses=['vector_l2_ops'] ) ] From 06a48c4699486b3dc2ab843104594d0fc4539038 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 09:12:53 -0800 Subject: [PATCH 291/424] Added pool example and tests for Psycopg 3 - closes #100 --- README.md | 9 +++++++++ requirements.txt | 2 +- tests/test_psycopg.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 917d1a7..0989ba7 100644 --- a/README.md +++ b/README.md @@ -338,6 +338,15 @@ from pgvector.psycopg import register_vector register_vector(conn) ``` +For [connection pools](https://www.psycopg.org/psycopg3/docs/advanced/pool.html), use + +```python +def configure(conn): + register_vector(conn) + +pool = ConnectionPool(configure=configure) +``` + For [async connections](https://www.psycopg.org/psycopg3/docs/advanced/async.html), use ```python diff --git a/requirements.txt b/requirements.txt index c1e11f3..0e30959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ asyncpg Django numpy peewee -psycopg[binary] +psycopg[binary,pool] psycopg2-binary pytest pytest-asyncio diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index c4e1c22..5802b2b 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,6 +1,7 @@ import numpy as np from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector, Vector import psycopg +from psycopg_pool import ConnectionPool, AsyncConnectionPool import pytest conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True) @@ -176,6 +177,18 @@ def test_vector_array(self): assert np.array_equal(res[0][0], embeddings[0]) assert np.array_equal(res[0][1], embeddings[1]) + def test_pool(self): + def configure(conn): + register_vector(conn) + + pool = ConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=True, configure=configure) + + with 
pool.connection() as conn: + res = conn.execute("SELECT '[1,2,3]'::vector").fetchone() + assert np.array_equal(res[0], np.array([1, 2, 3])) + + pool.close() + @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) @@ -195,3 +208,19 @@ async def test_async(self): assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 assert res[1][1] is None + + @pytest.mark.asyncio + async def test_async_pool(self): + async def configure(conn): + await register_vector_async(conn) + + pool = AsyncConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=False, configure=configure) + await pool.open() + + async with pool.connection() as conn: + async with conn.cursor() as cur: + await cur.execute("SELECT '[1,2,3]'::vector") + res = await cur.fetchone() + assert np.array_equal(res[0], np.array([1, 2, 3])) + + await pool.close() From 49072f2e37ff97b07b422aa0a41c4d3bd312879f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 09:33:14 -0800 Subject: [PATCH 292/424] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0989ba7..37a4737 100644 --- a/README.md +++ b/README.md @@ -344,7 +344,7 @@ For [connection pools](https://www.psycopg.org/psycopg3/docs/advanced/pool.html) def configure(conn): register_vector(conn) -pool = ConnectionPool(configure=configure) +pool = ConnectionPool(..., configure=configure) ``` For [async connections](https://www.psycopg.org/psycopg3/docs/advanced/async.html), use From 9e1c421c62a84f650cf8be73e0768bfab6715e7b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 11:18:41 -0800 Subject: [PATCH 293/424] Added docs and test for half-precision indexing with Django --- README.md | 25 +++++++++++++++++++++++++ tests/test_django.py | 25 ++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/README.md 
b/README.md index 37a4737..938207a 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,31 @@ class Item(models.Model): Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +#### Half-Precision Indexing + +Index vectors at half-precision + +```python +from django.contrib.postgres.indexes import OpClass +from django.db.models.functions import Cast +from pgvector.django import HalfVectorField + +index = HnswIndex( + OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), + name='my_index', + m=16, + ef_construction=64 +) +``` + +Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass` + +Get the nearest neighbors + +```python +Item.objects.order_by(L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]))[:5] +``` + ## SQLAlchemy Enable the extension diff --git a/tests/test_django.py b/tests/test_django.py index 92f3733..353087e 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -1,6 +1,7 @@ import django from django.conf import settings from django.contrib.postgres.fields import ArrayField +from django.contrib.postgres.indexes import OpClass from django.core import serializers from django.db import connection, migrations, models from django.db.models import Avg, Sum, FloatField, DecimalField @@ -38,7 +39,12 @@ 'level': 'WARNING' } } - } + }, + # needed for OpClass + # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/indexes/#opclass-expressions + INSTALLED_APPS=[ + 'django.contrib.postgres' + ] ) django.setup() @@ -67,6 +73,12 @@ class Meta: m=16, ef_construction=64, opclasses=['vector_l2_ops'] + ), + HnswIndex( + OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), + name='hnsw_half_precision_idx', + m=16, + ef_construction=64 ) ] @@ -99,6 +111,10 @@ class Migration(migrations.Migration): migrations.AddIndex( model_name='item', index=pgvector.django.HnswIndex(fields=['embedding'], m=16, ef_construction=64, name='hnsw_idx', 
opclasses=['vector_l2_ops']), + ), + migrations.AddIndex( + model_name='item', + index=pgvector.django.HnswIndex(OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), m=16, ef_construction=64, name='hnsw_half_precision_idx'), ) ] @@ -473,3 +489,10 @@ def test_numeric_array(self): assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] assert items[1].numeric_embedding == [1, 1, 2] + + def test_half_precision(self): + create_items() + distance = L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] From afcd67a4422dd390e07662bfe81a8fbfab571301 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 11:23:17 -0800 Subject: [PATCH 294/424] Updated readme [skip ci] --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 938207a..1000900 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,8 @@ Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass` Get the nearest neighbors ```python -Item.objects.order_by(L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]))[:5] +distance = L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]) +Item.objects.order_by(distance)[:5] ``` ## SQLAlchemy From 441b26ec3dfbdfb6013ffdf18df083614d9fc5ff Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 11:30:19 -0800 Subject: [PATCH 295/424] Updated example [skip ci] --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1000900..b61059e 100644 --- a/README.md +++ b/README.md @@ -142,12 +142,16 @@ from django.contrib.postgres.indexes import OpClass from django.db.models.functions import Cast from pgvector.django import HalfVectorField -index 
= HnswIndex( - OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), - name='my_index', - m=16, - ef_construction=64 -) +class Item(models.Model): + class Meta: + indexes = [ + HnswIndex( + OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), + name='my_index', + m=16, + ef_construction=64 + ) + ] ``` Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass` From 78e64594843f2eb833cca77d8f43c33971806963 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 11:31:05 -0800 Subject: [PATCH 296/424] Updated example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b61059e..94fed55 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ Index vectors at half-precision ```python from django.contrib.postgres.indexes import OpClass from django.db.models.functions import Cast -from pgvector.django import HalfVectorField +from pgvector.django import HnswIndex, HalfVectorField class Item(models.Model): class Meta: From 75e14d80c80975938c0f1c64f59901686f1cd24b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:02:54 -0800 Subject: [PATCH 297/424] Added pool test for Psycopg 2 --- tests/test_psycopg2.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index c93fce4..85f08aa 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -2,6 +2,7 @@ from pgvector.psycopg2 import register_vector, HalfVector, SparseVector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor +from psycopg2.pool import ThreadedConnectionPool conn = psycopg2.connect(dbname='pgvector_python_test') conn.autocommit = True @@ -94,3 +95,21 @@ def test_cursor_factory_connection(self): conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) register_vector(conn, globally=False) conn.close() + + def 
test_pool(self): + pool = ThreadedConnectionPool(1, 3, dbname='pgvector_python_test') + + conn = pool.getconn() + try: + cur = conn.cursor() + + # use globally=True for apps + register_vector(cur, globally=False) + + cur.execute("SELECT '[1,2,3]'::vector") + res = cur.fetchone() + assert np.array_equal(res[0], np.array([1, 2, 3])) + finally: + pool.putconn(conn) + + pool.closeall() From 706cebcb4c10f5fc6288757744fcfe94cb461a0b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:04:39 -0800 Subject: [PATCH 298/424] Improved test --- tests/test_psycopg2.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 85f08aa..3730eb8 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -101,11 +101,14 @@ def test_pool(self): conn = pool.getconn() try: - cur = conn.cursor() - # use globally=True for apps - register_vector(cur, globally=False) + register_vector(conn, globally=False) + finally: + pool.putconn(conn) + conn = pool.getconn() + try: + cur = conn.cursor() cur.execute("SELECT '[1,2,3]'::vector") res = cur.fetchone() assert np.array_equal(res[0], np.array([1, 2, 3])) From 812a85e7ce40d42382d84244b25e2f44eddf2e94 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:04:59 -0800 Subject: [PATCH 299/424] Improved test [skip ci] --- tests/test_psycopg2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3730eb8..3f52385 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -97,7 +97,7 @@ def test_cursor_factory_connection(self): conn.close() def test_pool(self): - pool = ThreadedConnectionPool(1, 3, dbname='pgvector_python_test') + pool = ThreadedConnectionPool(1, 1, dbname='pgvector_python_test') conn = pool.getconn() try: From 07a3b2b6eec65d332041dcec136ac9c75291bc2b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 5 Nov 2024 12:09:23 -0800 Subject: [PATCH 
300/424] Updated comment [skip ci] --- tests/test_psycopg2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3f52385..c3cd3cd 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -101,7 +101,7 @@ def test_pool(self): conn = pool.getconn() try: - # use globally=True for apps + # use globally=True for apps to ensure registered with all connections register_vector(conn, globally=False) finally: pool.putconn(conn) From ea32504ef8538c781fab1f579fcaec7b417b7163 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 11 Nov 2024 20:56:23 -0800 Subject: [PATCH 301/424] Updated pgvector on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f8bcaa3..04f1c21 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.7.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From 664b8ee8692a42236ff9b236ec2da635342b96c7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 18:28:01 -0800 Subject: [PATCH 302/424] Added test for halfvec arrays with SQLAlchemy - #101 --- tests/test_sqlalchemy.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 8a032ef..c9aa900 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -32,6 +32,7 @@ class Item(Base): binary_embedding = mapped_column(BIT(3)) sparse_embedding = mapped_column(SPARSEVEC(3)) embeddings = mapped_column(ARRAY(VECTOR(3))) + half_embeddings = mapped_column(ARRAY(HALFVEC(3))) Base.metadata.drop_all(engine) @@ -447,6 +448,20 @@ def test_vector_array(self): assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 
5, 6] + def test_halfvec_array(self): + session = Session(engine) + session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() + + with engine.connect() as connection: + from pgvector.psycopg2 import register_vector + register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) + + # this fails if the driver does not cast arrays + item = Session(bind=connection).get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] + def test_half_precision(self): create_items() with Session(engine) as session: From 1c7e6a5fb3ea31512dacf71eaf4165eae9fa60e8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 19:04:21 -0800 Subject: [PATCH 303/424] Added docs for arrays with SQLAlchemy [skip ci] --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 94fed55..44f1d93 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,27 @@ order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2]) session.scalars(select(Item).order_by(order).limit(5)) ``` +#### Arrays + +Add an array column + +```python +from pgvector.sqlalchemy import Vector +from sqlalchemy import ARRAY + +class Item(Base): + embeddings = mapped_column(ARRAY(Vector(3))) +``` + +And register the types with the underlying driver + +```python +from pgvector.psycopg2 import register_vector + +with engine.connect() as connection: + register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) +``` + ## SQLModel Enable the extension From 0a760663b1acd993c7caf364c8c087c50306a01f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 20:42:27 -0800 Subject: [PATCH 304/424] Use connection from session in example and tests --- README.md | 2 +- tests/test_sqlalchemy.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 44f1d93..757ade9 
100644 --- a/README.md +++ b/README.md @@ -285,7 +285,7 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector -with engine.connect() as connection: +with session.connection() as connection: register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) ``` diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index c9aa900..57cc12b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -439,12 +439,12 @@ def test_vector_array(self): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with engine.connect() as connection: + with session.connection() as connection: from pgvector.psycopg2 import register_vector register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) # this fails if the driver does not cast arrays - item = Session(bind=connection).get(Item, 1) + item = session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] @@ -453,12 +453,12 @@ def test_halfvec_array(self): session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with engine.connect() as connection: + with session.connection() as connection: from pgvector.psycopg2 import register_vector register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) # this fails if the driver does not cast arrays - item = Session(bind=connection).get(Item, 1) + item = session.get(Item, 1) assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] From 030def94b19329fa29c71f5273183f82c0550fd3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 20:55:15 -0800 Subject: [PATCH 305/424] Improved example and tests for arrays with SQLAlchemy - #101 [skip ci] --- README.md | 6 ++++-- tests/test_sqlalchemy.py | 39 ++++++++++++++++++++------------------- 2 files 
changed, 24 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 757ade9..991c51f 100644 --- a/README.md +++ b/README.md @@ -284,9 +284,11 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector +from sqlalchemy import engine -with session.connection() as connection: - register_vector(connection.connection.dbapi_connection, globally=True, arrays=True) +@event.listens_for(engine, "connect") +def connect(dbapi_connection, connection_record): + register_vector(dbapi_connection, arrays=True) ``` ## SQLModel diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 57cc12b..f8e4bb1 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import numpy as np from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest -from sqlalchemy import create_engine, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY +from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError from sqlalchemy.ext.automap import automap_base from sqlalchemy.orm import declarative_base, Session @@ -20,6 +20,15 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() +array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') + + +@event.listens_for(array_engine, "connect") +def connect(dbapi_connection, connection_record): + from pgvector.psycopg2 import register_vector + register_vector(dbapi_connection, globally=False, arrays=True) + + Base = declarative_base() @@ -435,32 +444,24 @@ def test_automap(self): assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): - session = Session(engine) + session = Session(array_engine) session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with session.connection() as 
connection: - from pgvector.psycopg2 import register_vector - register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) - - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self): - session = Session(engine) + session = Session(array_engine) session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() - with session.connection() as connection: - from pgvector.psycopg2 import register_vector - register_vector(connection.connection.dbapi_connection, globally=False, arrays=True) - - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] def test_half_precision(self): create_items() From d23844ef10dcd4297a9e2f3671ed8e851e0a2db1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 21:00:22 -0800 Subject: [PATCH 306/424] Fixed example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 991c51f..516f3aa 100644 --- a/README.md +++ b/README.md @@ -284,7 +284,7 @@ And register the types with the underlying driver ```python from pgvector.psycopg2 import register_vector -from sqlalchemy import engine +from sqlalchemy import event @event.listens_for(engine, "connect") def connect(dbapi_connection, connection_record): From 04aa5bca2ee60c73de91507e5eb7472a6cf6d7a6 Mon Sep 17 00:00:00 2001 
From: Andrew Kane Date: Tue, 12 Nov 2024 22:39:46 -0800 Subject: [PATCH 307/424] Added test for arrays with SQLAlchemy async - #101 --- tests/test_sqlalchemy.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f8e4bb1..77c03fc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -483,3 +483,25 @@ async def test_async(self): assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_async_vector_array(self): + engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + @event.listens_for(engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) + + async with async_session() as session: + async with session.begin(): + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays + item = await session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + + await engine.dispose() From dbc44f4533e9edaa376dc4d4a18fea235c5e2187 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 12 Nov 2024 22:48:46 -0800 Subject: [PATCH 308/424] Added more examples for arrays with SQLAlchemy [skip ci] --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 516f3aa..0dedce9 100644 --- a/README.md +++ b/README.md @@ -282,6 +282,30 @@ class Item(Base): And register the types with the underlying driver +For Psycopg 3, use + +```python +from pgvector.psycopg import register_vector +from sqlalchemy import event + +@event.listens_for(engine, "connect") +def 
connect(dbapi_connection, connection_record): + register_vector(dbapi_connection) +``` + +For [async connections](https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html) with Psycopg 3, use + +```python +from pgvector.psycopg import register_vector_async +from sqlalchemy import event + +@event.listens_for(engine.sync_engine, "connect") +def connect(dbapi_connection, connection_record): + dbapi_connection.run_async(register_vector_async) +``` + +For Psycopg 2, use + ```python from pgvector.psycopg2 import register_vector from sqlalchemy import event From 368b363bbf9a48fe42bc114991c4e97ee140cdeb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:09:06 -0800 Subject: [PATCH 309/424] Added ColBERT example for binary embeddings [skip ci] --- examples/colbert/exact_binary.py | 53 ++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 examples/colbert/exact_binary.py diff --git a/examples/colbert/exact_binary.py b/examples/colbert/exact_binary.py new file mode 100644 index 0000000..8d398e2 --- /dev/null +++ b/examples/colbert/exact_binary.py @@ -0,0 +1,53 @@ +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector, Bit +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings bit(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN 
documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + + +def binary_quantize(embeddings): + return [Bit(e.numpy()) for e in (embeddings > 0)] + + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + embeddings = binary_quantize(embeddings) + conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) + +query = 'puppy' +query_embeddings = binary_quantize(checkpoint.queryFromText([query])[0]) +result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) From 267d7960156b6866c300229a10b79b89d670ea39 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:41:25 -0800 Subject: [PATCH 310/424] Added ColPali / ColQwen2 example [skip ci] --- README.md | 1 + examples/colpali/exact.py | 52 +++++++++++++++++++++++++++++++ examples/colpali/requirements.txt | 4 +++ 3 files changed, 57 insertions(+) create mode 100644 examples/colpali/exact.py create mode 100644 examples/colpali/requirements.txt diff --git a/README.md b/README.md index 0dedce9..224fe57 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ Or check out some examples: - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers - [Late interaction 
search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT +- [Document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing - [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py new file mode 100644 index 0000000..408bc7f --- /dev/null +++ b/examples/colpali/exact.py @@ -0,0 +1,52 @@ +from colpali_engine.models import ColQwen2, ColQwen2Processor +from datasets import load_dataset +from pgvector.psycopg import register_vector +import psycopg +import torch + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings vector(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + + +device = 'mps' if torch.backends.mps.is_available() else 'cpu' +model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, 
device_map=device).eval() +processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') + + +def generate_embeddings(processed): + with torch.no_grad(): + return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) + + +input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] +for content in input: + embeddings = [e.numpy() for e in generate_embeddings(processor.process_images([content]))[0]] + conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) + +query = 'dividend' +query_embeddings = [e.numpy() for e in generate_embeddings(processor.process_queries([query]))[0]] +result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) diff --git a/examples/colpali/requirements.txt b/examples/colpali/requirements.txt new file mode 100644 index 0000000..4cf770d --- /dev/null +++ b/examples/colpali/requirements.txt @@ -0,0 +1,4 @@ +colpali-engine +datasets +pgvector +psycopg[binary] From 7d8a4173d988b5e9debaba7b4d6320d61879e76c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:47:10 -0800 Subject: [PATCH 311/424] Updated ColPali example to use binary quantization [skip ci] --- examples/colbert/exact_binary.py | 53 -------------------------------- examples/colpali/exact.py | 16 ++++++---- 2 files changed, 10 insertions(+), 59 deletions(-) delete mode 100644 examples/colbert/exact_binary.py diff --git a/examples/colbert/exact_binary.py b/examples/colbert/exact_binary.py deleted file mode 100644 index 8d398e2..0000000 --- a/examples/colbert/exact_binary.py +++ /dev/null @@ -1,53 +0,0 @@ -from colbert.infra import ColBERTConfig -from colbert.modeling.checkpoint import Checkpoint -from pgvector.psycopg import register_vector, Bit -import psycopg - -conn = psycopg.connect(dbname='pgvector_example', autocommit=True) - -conn.execute('CREATE EXTENSION IF NOT EXISTS 
vector') -register_vector(conn) - -conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings bit(128)[])') -conn.execute(""" -CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ - WITH queries AS ( - SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) - ), - documents AS ( - SELECT unnest(document) AS document - ), - similarities AS ( - SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents - ), - max_similarities AS ( - SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number - ) - SELECT SUM(max_similarity) FROM max_similarities -$$ LANGUAGE SQL -""") - - -def binary_quantize(embeddings): - return [Bit(e.numpy()) for e in (embeddings > 0)] - - -config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) -checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) - -input = [ - 'The dog is barking', - 'The cat is purring', - 'The bear is growling' -] -doc_embeddings = checkpoint.docFromText(input, keep_dims=False) -for content, embeddings in zip(input, doc_embeddings): - embeddings = binary_quantize(embeddings) - conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) - -query = 'puppy' -query_embeddings = binary_quantize(checkpoint.queryFromText([query])[0]) -result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() -for row in result: - print(row) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 408bc7f..9fffc5f 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -1,6 +1,6 @@ from colpali_engine.models import ColQwen2, ColQwen2Processor from datasets import load_dataset -from pgvector.psycopg import register_vector +from 
pgvector.psycopg import register_vector, Bit import psycopg import torch @@ -10,9 +10,9 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings vector(128)[])') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings bit(128)[])') conn.execute(""" -CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ +CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ WITH queries AS ( SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) ), @@ -20,7 +20,7 @@ SELECT unnest(document) AS document ), similarities AS ( - SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents ), max_similarities AS ( SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number @@ -40,13 +40,17 @@ def generate_embeddings(processed): return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) +def binary_quantize(embedding): + return Bit(embedding > 0) + + input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] for content in input: - embeddings = [e.numpy() for e in generate_embeddings(processor.process_images([content]))[0]] + embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_images([content]))[0]] conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) query = 'dividend' -query_embeddings = [e.numpy() for e in generate_embeddings(processor.process_queries([query]))[0]] +query_embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_queries([query]))[0]] result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 
5', (query_embeddings,)).fetchall() for row in result: print(row) From d73a412de5fcb6d225b4d90865f0c4e514a142d3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:54:52 -0800 Subject: [PATCH 312/424] Updated ColPali example to use get_torch_device [skip ci] --- examples/colpali/exact.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 9fffc5f..6eac7a4 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -1,4 +1,5 @@ from colpali_engine.models import ColQwen2, ColQwen2Processor +from colpali_engine.utils.torch_utils import get_torch_device from datasets import load_dataset from pgvector.psycopg import register_vector, Bit import psycopg @@ -30,7 +31,7 @@ """) -device = 'mps' if torch.backends.mps.is_available() else 'cpu' +device = get_torch_device('auto') model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') From 7b6a46a014144f05ba174a53510ed69fd113b100 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 15:55:26 -0800 Subject: [PATCH 313/424] Removed extra line [skip ci] --- examples/colpali/exact.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 6eac7a4..06d1828 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -30,7 +30,6 @@ $$ LANGUAGE SQL """) - device = get_torch_device('auto') model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') From 4998aa1daefe95eb7550bc92ed875d6193d73b57 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:19:48 -0800 Subject: [PATCH 314/424] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md 
b/README.md index 224fe57..260d389 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Or check out some examples: - [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) - [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers - [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT -- [Document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali +- [Visual document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch - [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing - [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit From af7b04f271612c3f663e9a508f9c44564272e3a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:40:44 -0800 Subject: [PATCH 315/424] Updated example [skip ci] --- examples/colpali/exact.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index 06d1828..c6f1467 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -37,20 +37,20 @@ def generate_embeddings(processed): with torch.no_grad(): - return model(**processed.to(model.device)).to(device='cpu', dtype=torch.float32) + return model(**processed.to(model.device)) def binary_quantize(embedding): - return Bit(embedding > 0) + return Bit(embedding.gt(0).numpy(force=True)) input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] for content in input: - 
embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_images([content]))[0]] + embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_images([content]))[0]] conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) query = 'dividend' -query_embeddings = [binary_quantize(e.numpy()) for e in generate_embeddings(processor.process_queries([query]))[0]] +query_embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_queries([query]))[0]] result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() for row in result: print(row) From ca637bff37674592f08b9f65c75249b0d709746e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 16 Nov 2024 16:44:27 -0800 Subject: [PATCH 316/424] Updated example [skip ci] --- examples/colpali/exact.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py index c6f1467..80bb603 100644 --- a/examples/colpali/exact.py +++ b/examples/colpali/exact.py @@ -37,11 +37,11 @@ def generate_embeddings(processed): with torch.no_grad(): - return model(**processed.to(model.device)) + return model(**processed.to(model.device)).to(torch.float32).numpy(force=True) def binary_quantize(embedding): - return Bit(embedding.gt(0).numpy(force=True)) + return Bit(embedding > 0) input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] From 5c35a5399aa3101f35e09e941a4e7cce0218e1ef Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 30 Nov 2024 03:30:34 -0800 Subject: [PATCH 317/424] Added test for binary quantization with SQLAlchemy - #98 [skip ci] --- tests/test_sqlalchemy.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 77c03fc..0380c89 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py 
@@ -65,6 +65,14 @@ class Item(Base): ) half_precision_index.create(engine) +binary_quantize_index = Index( + 'sqlalchemy_orm_binary_quantize_index', + func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'bit_hamming_ops'} +) +binary_quantize_index.create(engine) def create_items(): session = Session(engine) @@ -469,6 +477,18 @@ def test_half_precision(self): items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] + def test_binary_quantize(self): + session = Session(engine) + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + + with Session(engine) as session: + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + items = session.query(Item).order_by(distance).all() + assert [v.id for v in items] == [2, 3, 1] + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_async(self): From 230fe853d58105df1951fbbbc1730469b341f056 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 19 Dec 2024 11:59:52 -0800 Subject: [PATCH 318/424] Fixed spacing [skip ci] --- tests/test_sqlalchemy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0380c89..9ab706a 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -74,6 +74,7 @@ class Item(Base): ) binary_quantize_index.create(engine) + def create_items(): session = Session(engine) session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) From 057806e44f6943230699d99c742621baeb9023c3 Mon Sep 17 00:00:00 
2001 From: Andrew Kane Date: Thu, 19 Dec 2024 12:00:52 -0800 Subject: [PATCH 319/424] Added test for bit type with SQLAlchemy and asyncpg - #110 --- tests/test_sqlalchemy.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 9ab706a..0b53252 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -526,3 +526,20 @@ def connect(dbapi_connection, connection_record): assert item.embeddings[1].tolist() == [4, 5, 6] await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_bit(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = asyncpg.BitString('101') + session.add(Item(id=1, binary_embedding=embedding)) + item = await session.get(Item, 1) + assert item.binary_embedding == embedding + + await engine.dispose() From 57b6a61149c1f009ae55cccc17a9b5900e335f72 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 19 Dec 2024 12:06:02 -0800 Subject: [PATCH 320/424] Improved asyncpg test [skip ci] --- tests/test_asyncpg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 7a68a9e..48d1e32 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -59,10 +59,11 @@ async def test_bit(self): await register_vector(conn) - embedding = asyncpg.BitString.from_int(5, length=3) + embedding = asyncpg.BitString('101') await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert res[0]['embedding'].as_string() == '101' assert res[0]['embedding'].to_int() == 5 assert res[1]['embedding'] is None From 
47ad76d88f72cf07ffa238e4ad2714b672346149 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:35:43 -0800 Subject: [PATCH 321/424] Improved SQLModel example --- README.md | 3 +-- tests/test_sqlmodel.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 260d389..ca399ea 100644 --- a/README.md +++ b/README.md @@ -328,10 +328,9 @@ Add a vector column ```python from pgvector.sqlalchemy import Vector -from sqlalchemy import Column class Item(SQLModel, table=True): - embedding: Any = Field(sa_column=Column(Vector(3))) + embedding: Any = Field(sa_type=Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 4cb0e9b..8a1c86c 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -15,10 +15,10 @@ class Item(SQLModel, table=True): __tablename__ = 'sqlmodel_item' id: Optional[int] = Field(default=None, primary_key=True) - embedding: Optional[Any] = Field(default=None, sa_column=Column(VECTOR(3))) - half_embedding: Optional[Any] = Field(default=None, sa_column=Column(HALFVEC(3))) - binary_embedding: Optional[Any] = Field(default=None, sa_column=Column(BIT(3))) - sparse_embedding: Optional[Any] = Field(default=None, sa_column=Column(SPARSEVEC(3))) + embedding: Optional[Any] = Field(default=None, sa_type=VECTOR(3)) + half_embedding: Optional[Any] = Field(default=None, sa_type=HALFVEC(3)) + binary_embedding: Optional[Any] = Field(default=None, sa_type=BIT(3)) + sparse_embedding: Optional[Any] = Field(default=None, sa_type=SPARSEVEC(3)) SQLModel.metadata.drop_all(engine) From b3e8908d3b3d74eba016b1cdc5bc7b1df1ad92bf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:36:07 -0800 Subject: [PATCH 322/424] Removed unneeded code [skip ci] --- tests/test_django.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 353087e..2c53d82 100644 --- a/tests/test_django.py +++ 
b/tests/test_django.py @@ -86,9 +86,6 @@ class Meta: class Migration(migrations.Migration): initial = True - dependencies = [ - ] - operations = [ VectorExtension(), migrations.CreateModel( From edd9b4ba02160ef429c4e44455eb0bfe6c781092 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:37:05 -0800 Subject: [PATCH 323/424] Test with Python 3.13 on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 04f1c21..562ba94 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.12, 3.8] + python: [3.13, 3.8] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 From 1b9df46f9542f3262f6c93a1a858c1414a0ffdc5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:38:38 -0800 Subject: [PATCH 324/424] Improved test code [skip ci] --- tests/test_sqlmodel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 8a1c86c..851afd8 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,9 +1,8 @@ import numpy as np from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest -from sqlalchemy import Column, Index from sqlalchemy.exc import StatementError -from sqlmodel import Field, Session, SQLModel, create_engine, delete, select, text +from sqlmodel import Field, Index, Session, SQLModel, create_engine, delete, select, text from typing import Any, Optional engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') From a3c611f3f141a00c42b311f387278bb4f3ee4bcf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 19:42:16 -0800 Subject: [PATCH 325/424] Updated examples [skip ci] --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md 
index ca399ea..d7a7e6c 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(Base): - embedding = mapped_column(Vector(3)) + embedding = mapped_column(VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` @@ -274,11 +274,11 @@ session.scalars(select(Item).order_by(order).limit(5)) Add an array column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import ARRAY class Item(Base): - embeddings = mapped_column(ARRAY(Vector(3))) + embeddings = mapped_column(ARRAY(VECTOR(3))) ``` And register the types with the underlying driver @@ -327,10 +327,10 @@ session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(SQLModel, table=True): - embedding: Any = Field(sa_type=Vector(3)) + embedding: Any = Field(sa_type=VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` From 2ba2a855164f6f0947f17b94201a46d5ad615e6c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 20:00:48 -0800 Subject: [PATCH 326/424] Improved example [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7a7e6c..794cf91 100644 --- a/README.md +++ b/README.md @@ -376,7 +376,7 @@ Also supports `sum` Add an approximate index ```python -from sqlalchemy import Index +from sqlmodel import Index index = Index( 'my_index', From c6d2ddd429c10316ef329dd07ab86fe192bc71a0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:09:23 -0800 Subject: [PATCH 327/424] Improved sparsevec tests [skip ci] --- pgvector/utils/sparsevec.py | 1 + tests/test_psycopg.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/pgvector/utils/sparsevec.py 
b/pgvector/utils/sparsevec.py index fd9ccff..a370c5e 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -108,6 +108,7 @@ def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) + # TODO convert indices and values to lists in 0.4.0 return cls._from_parts(int(dim), indices, values) @classmethod diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 5802b2b..6d4f34a 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -110,12 +110,19 @@ def test_sparsevec(self): def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res.dimensions() == 6 + # TODO convert indices and values to lists in 0.4.0 + assert res.indices() == (0, 2, 4) + assert res.values() == (1.5, 2, 3) assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res.dimensions() == 6 + assert res.indices() == [0, 2, 4] + assert res.values() == [1.5, 2, 3] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From 9d9f45b800f3731e213f7b06bf3374e177ad86d5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:11:22 -0800 Subject: [PATCH 328/424] Added todo [skip ci] --- pgvector/psycopg2/register.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 7752852..08a69a9 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,6 +5,7 @@ from .vector import register_vector_info +# TODO remove default value for conn_or_curs in 0.4.0 # TODO make 
globally False by default in 0.4.0 # note: register_adapter is always global # TODO make arrays True by defalt in 0.4.0 From 972b6739788f5a09ec270bed552182a052e994c5 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 22:18:26 -0800 Subject: [PATCH 329/424] Updated license year [skip ci] --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index d205f4e..b612d6d 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2021-2024 Andrew Kane +Copyright (c) 2021-2025 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 57b7d3ba12781871045a378221d90bc972a3d5c1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:26:02 -0800 Subject: [PATCH 330/424] Added test for vector type with SQLAlchemy and asyncpg - #114 --- tests/test_sqlalchemy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0b53252..6fc0adf 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -527,6 +527,29 @@ def connect(dbapi_connection, connection_record): await engine.dispose() + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_vector(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = np.array([1, 2, 3]) + 
session.add(Item(id=1, embedding=embedding)) + item = await session.get(Item, 1) + assert np.array_equal(item.embedding, embedding) + + await engine.dispose() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_bit(self): From bf9a0a469983eabb1b1b38c6ba2495e3c4c2b8ce Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:29:37 -0800 Subject: [PATCH 331/424] Added tests for halfvec and sparsevec types with SQLAlchemy and asyncpg [skip ci] --- tests/test_sqlalchemy.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 6fc0adf..40068e9 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -550,6 +550,29 @@ async def test_asyncpg_vector(self): await engine.dispose() + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_halfvec(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, half_embedding=embedding)) + item = await session.get(Item, 1) + assert item.half_embedding.to_list() == embedding + + await engine.dispose() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_bit(self): @@ -566,3 +589,26 @@ async def test_asyncpg_bit(self): assert item.binary_embedding == embedding await engine.dispose() + + @pytest.mark.asyncio + 
@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_sparsevec(self): + import asyncpg + + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, sparse_embedding=embedding)) + item = await session.get(Item, 1) + assert item.sparse_embedding.to_list() == embedding + + await engine.dispose() From 257eb3b92c9f02e2ca266a15c6c8b93ebc94082a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:30:55 -0800 Subject: [PATCH 332/424] Simplified tests [skip ci] --- tests/test_sqlalchemy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 40068e9..519a388 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -530,8 +530,6 @@ def connect(dbapi_connection, connection_record): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_vector(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -553,8 +551,6 @@ async def test_asyncpg_vector(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_halfvec(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -593,8 +589,6 @@ async def 
test_asyncpg_bit(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_sparsevec(self): - import asyncpg - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From 91f5d34c11f0064c83ca08b7e69055ce6ef03124 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:32:53 -0800 Subject: [PATCH 333/424] Added test for vector[] type with SQLAlchemy and asyncpg [skip ci] --- tests/test_sqlalchemy.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 519a388..7e8b888 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -606,3 +606,26 @@ async def test_asyncpg_sparsevec(self): assert item.sparse_embedding.to_list() == embedding await engine.dispose() + + @pytest.mark.asyncio + @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') + async def test_asyncpg_vector_array(self): + engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_session = async_sessionmaker(engine, expire_on_commit=False) + + # TODO do not throw error when types are registered + # @event.listens_for(engine.sync_engine, "connect") + # def connect(dbapi_connection, connection_record): + # from pgvector.asyncpg import register_vector + # dbapi_connection.run_async(register_vector) + + async with async_session() as session: + async with session.begin(): + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays + item = await session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + + await engine.dispose() From f7eeb3a04554b9adf82a5073d08fc757c41604a3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:36:40 -0800 
Subject: [PATCH 334/424] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 7e8b888..f3d045f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -492,7 +492,7 @@ def test_binary_quantize(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_async(self): + async def test_async_avg(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -622,8 +622,6 @@ async def test_asyncpg_vector_array(self): async with async_session() as session: async with session.begin(): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this fails if the driver does not cast arrays item = await session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] From 2d2563d702ee319a33d17b27549bce035a6c7348 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 8 Feb 2025 23:39:55 -0800 Subject: [PATCH 335/424] Improved test names [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f3d045f..fd46e74 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -492,7 +492,7 @@ def test_binary_quantize(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_async_avg(self): + async def test_psycopg_async_avg(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -507,7 +507,7 @@ async def test_async_avg(self): @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires 
SQLAlchemy 2+') - async def test_async_vector_array(self): + async def test_psycopg_async_vector_array(self): engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From d828239fb466e11a8fb02c7e35a052dbbce3e5b8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 01:00:50 -0800 Subject: [PATCH 336/424] Revert "Updated examples [skip ci]" This reverts commit a3c611f3f141a00c42b311f387278bb4f3ee4bcf. --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 794cf91..5a59c9d 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector class Item(Base): - embedding = mapped_column(VECTOR(3)) + embedding = mapped_column(Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` @@ -274,11 +274,11 @@ session.scalars(select(Item).order_by(order).limit(5)) Add an array column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector from sqlalchemy import ARRAY class Item(Base): - embeddings = mapped_column(ARRAY(VECTOR(3))) + embeddings = mapped_column(ARRAY(Vector(3))) ``` And register the types with the underlying driver @@ -327,10 +327,10 @@ session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import VECTOR +from pgvector.sqlalchemy import Vector class Item(SQLModel, table=True): - embedding: Any = Field(sa_type=VECTOR(3)) + embedding: Any = Field(sa_type=Vector(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` From 8a7040d2ee79ac8fc6313538ffbc38ebad3ac197 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 01:34:02 -0800 Subject: [PATCH 337/424] Removed unneeded code [skip ci] --- examples/citus/example.py | 3 --- 
1 file changed, 3 deletions(-) diff --git a/examples/citus/example.py b/examples/citus/example.py index d448204..915c25f 100644 --- a/examples/citus/example.py +++ b/examples/citus/example.py @@ -40,9 +40,6 @@ for i in range(rows): copy.write_row([embeddings[i], categories[i]]) - while conn.pgconn.flush() == 1: - pass - print('Creating index in parallel') conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') From 00cd08e6c44077b99f378edbd007b2483ff406f7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:09:23 -0800 Subject: [PATCH 338/424] Improved tests --- tests/test_sqlalchemy.py | 139 ++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 68 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index fd46e74..405cd21 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -76,11 +76,11 @@ class Item(Base): def create_items(): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) - session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) - session.commit() + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) + session.commit() class TestSqlalchemy: @@ -129,11 +129,11 @@ def test_orm(self): item2 = Item(embedding=[4, 5, 6]) item3 = Item() - session = 
Session(engine) - session.add(item) - session.add(item2) - session.add(item3) - session.commit() + with Session(engine) as session: + session.add(item) + session.add(item2) + session.add(item3) + session.commit() stmt = select(Item) with Session(engine) as session: @@ -148,11 +148,11 @@ def test_orm(self): assert items[2].embedding is None def test_vector(self): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] def test_vector_l2_distance(self): create_items() @@ -203,11 +203,11 @@ def test_vector_l1_distance_orm(self): assert [v.id for v in items] == [1, 3, 2] def test_halfvec(self): - session = Session(engine) - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] def test_halfvec_l2_distance(self): create_items() @@ -258,11 +258,11 @@ def test_halfvec_l1_distance_orm(self): assert [v.id for v in items] == [1, 3, 2] def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' def test_bit_hamming_distance(self): create_items() @@ -289,11 +289,11 @@ def test_bit_jaccard_distance_orm(self): assert [v.id for v in items] == [2, 3, 1] def test_sparsevec(self): - session = Session(engine) - 
session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() @@ -405,24 +405,24 @@ def test_sum_orm(self): def test_bad_dimensions(self): item = Item(embedding=[1, 2]) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() def test_bad_ndim(self): item = Item(embedding=np.array([[1, 2, 3]])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected ndim to be 1'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected ndim to be 1'): + session.commit() def test_bad_dtype(self): item = Item(embedding=np.array(['one', 'two', 'three'])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='could not convert string to float'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='could not convert string to float'): + session.commit() def test_inspect(self): columns = inspect(engine).get_columns('sqlalchemy_orm_item') @@ -433,14 +433,17 @@ def test_literal_binds(self): assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) def test_insert(self): - session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) + with Session(engine) as session: + session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) def test_insert_bulk(self): - session.execute(insert(Item), [{'embedding': 
np.array([1, 2, 3])}]) + with Session(engine) as session: + session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) # register_vector in psycopg2 tests change this behavior # def test_insert_text(self): - # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + # with Session(engine) as session: + # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) def test_automap(self): metadata = MetaData() @@ -448,29 +451,30 @@ def test_automap(self): AutoBase = automap_base(metadata=metadata) AutoBase.prepare() AutoItem = AutoBase.classes.sqlalchemy_orm_item - session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) - item = session.query(AutoItem).first() - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) + item = session.query(AutoItem).first() + assert item.embedding.tolist() == [1, 2, 3] def test_vector_array(self): - session = Session(array_engine) - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - session.commit() + with Session(array_engine) as session: + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self): - session = Session(array_engine) - session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - session.commit() + with Session(array_engine) as session: + session.add(Item(id=1, half_embeddings=[np.array([1, 
2, 3]), np.array([4, 5, 6])])) + session.commit() - # this fails if the driver does not cast arrays - item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.half_embeddings[0].to_list() == [1, 2, 3] + assert item.half_embeddings[1].to_list() == [4, 5, 6] def test_half_precision(self): create_items() @@ -479,13 +483,12 @@ def test_half_precision(self): assert [v.id for v in items] == [1, 3, 2] def test_binary_quantize(self): - session = Session(engine) - session.add(Item(id=1, embedding=[-1, -2, -3])) - session.add(Item(id=2, embedding=[1, -2, 3])) - session.add(Item(id=3, embedding=[1, 2, 3])) - session.commit() - with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) items = session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] From 7837e92d72eef265e075e8ea5aa305e159e41437 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:15:18 -0800 Subject: [PATCH 339/424] Added more tests for SQLAlchemy --- tests/test_sqlalchemy.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 405cd21..79b3c50 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -29,6 +29,8 @@ def connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) +psycopg3_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + Base = declarative_base() @@ -493,6 +495,34 @@ def test_binary_quantize(self): items = 
session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] + def test_psycopg_vector(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] + + def test_psycopg_halfvec(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] + + def test_psycopg_bit(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_psycopg_sparsevec(self): + with Session(psycopg3_engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_psycopg_async_avg(self): From f08cec7a0522b19942a02df14f3f396f0773c912 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:46:42 -0800 Subject: [PATCH 340/424] Parameterize SQLAlchemy tests --- tests/test_sqlalchemy.py | 181 ++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 97 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 79b3c50..a4ac860 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -15,8 +15,15 @@ mapped_column = Column sqlalchemy_version = 1 -engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -with Session(engine) as session: +psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +engines = [psycopg2_engine] + +if sqlalchemy_version > 1: + psycopg_engine = 
create_engine('postgresql+psycopg://localhost/pgvector_python_test') + engines.append(psycopg_engine) + +setup_engine = engines[0] +with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() @@ -29,8 +36,6 @@ def connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) -psycopg3_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - Base = declarative_base() @@ -46,8 +51,8 @@ class Item(Base): half_embeddings = mapped_column(ARRAY(HALFVEC(3))) -Base.metadata.drop_all(engine) -Base.metadata.create_all(engine) +Base.metadata.drop_all(setup_engine) +Base.metadata.create_all(setup_engine) index = Index( 'sqlalchemy_orm_index', @@ -56,7 +61,7 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_l2_ops'} ) -index.create(engine) +index.create(setup_engine) half_precision_index = Index( 'sqlalchemy_orm_half_precision_index', @@ -65,7 +70,7 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'halfvec_l2_ops'} ) -half_precision_index.create(engine) +half_precision_index.create(setup_engine) binary_quantize_index = Index( 'sqlalchemy_orm_binary_quantize_index', @@ -74,24 +79,29 @@ class Item(Base): postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'bit_hamming_ops'} ) -binary_quantize_index.create(engine) +binary_quantize_index.create(setup_engine) def create_items(): - with Session(engine) as session: + with Session(setup_engine) as session: session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', 
sparse_embedding=SparseVector([1, 1, 2]))) session.commit() +def delete_items(): + with Session(setup_engine) as session: + session.query(Item).delete() + session.commit() + + +@pytest.mark.parametrize("engine", engines) class TestSqlalchemy: - def setup_method(self, test_method): - with Session(engine) as session: - session.query(Item).delete() - session.commit() + def setup_method(self): + delete_items() - def test_core(self): + def test_core(self, engine): metadata = MetaData() item_table = Table( @@ -126,7 +136,7 @@ def test_core(self): ) hnsw_index.create(engine) - def test_orm(self): + def test_orm(self, engine): item = Item(embedding=np.array([1.5, 2, 3])) item2 = Item(embedding=[4, 5, 6]) item3 = Item() @@ -140,236 +150,236 @@ def test_orm(self): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id == 1 - assert items[1].id == 2 - assert items[2].id == 3 + assert items[0].id in [1, 4] + assert items[1].id in [2, 5] + assert items[2].id in [3, 6] assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None - def test_vector(self): + def test_vector(self, engine): with Session(engine) as session: session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.embedding.tolist() == [1, 2, 3] - def test_vector_l2_distance(self): + def test_vector_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_vector_l2_distance_orm(self): + def test_vector_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) 
assert [v.id for v in items] == [1, 3, 2] - def test_vector_max_inner_product(self): + def test_vector_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_vector_max_inner_product_orm(self): + def test_vector_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_vector_cosine_distance(self): + def test_vector_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_vector_cosine_distance_orm(self): + def test_vector_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_vector_l1_distance(self): + def test_vector_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_vector_l1_distance_orm(self): + def test_vector_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_halfvec(self): + def test_halfvec(self, engine): with Session(engine) as session: session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.half_embedding.to_list() == [1, 2, 3] - def test_halfvec_l2_distance(self): + def test_halfvec_l2_distance(self, engine): create_items() 
with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_l2_distance_orm(self): + def test_halfvec_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_max_inner_product(self): + def test_halfvec_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_halfvec_max_inner_product_orm(self): + def test_halfvec_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_halfvec_cosine_distance(self): + def test_halfvec_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_halfvec_cosine_distance_orm(self): + def test_halfvec_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_halfvec_l1_distance(self): + def test_halfvec_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_halfvec_l1_distance_orm(self): + def test_halfvec_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = 
session.scalars(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_bit(self): + def test_bit(self, engine): with Session(engine) as session: session.add(Item(id=1, binary_embedding='101')) session.commit() item = session.get(Item, 1) assert item.binary_embedding == '101' - def test_bit_hamming_distance(self): + def test_bit_hamming_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] - def test_bit_hamming_distance_orm(self): + def test_bit_hamming_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) assert [v.id for v in items] == [2, 3, 1] - def test_bit_jaccard_distance(self): + def test_bit_jaccard_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] - def test_bit_jaccard_distance_orm(self): + def test_bit_jaccard_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec(self): + def test_sparsevec(self, engine): with Session(engine) as session: session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) assert item.sparse_embedding.to_list() == [1, 2, 3] - def test_sparsevec_l2_distance(self): + def test_sparsevec_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_l2_distance_orm(self): + def 
test_sparsevec_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_max_inner_product(self): + def test_sparsevec_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec_max_inner_product_orm(self): + def test_sparsevec_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_sparsevec_cosine_distance(self): + def test_sparsevec_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_sparsevec_cosine_distance_orm(self): + def test_sparsevec_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_sparsevec_l1_distance(self): + def test_sparsevec_l1_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_sparsevec_l1_distance_orm(self): + def test_sparsevec_l1_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_filter(self): + def test_filter(self, engine): create_items() 
with Session(engine) as session: items = session.query(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1).all() assert [v.id for v in items] == [1] - def test_filter_orm(self): + def test_filter_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1)) assert [v.id for v in items] == [1] - def test_select(self): + def test_select(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) items = session.query(Item.embedding.l2_distance([1, 1, 1])).first() assert items[0] == 3 - def test_select_orm(self): + def test_select_orm(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) items = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() assert items[0] == 3 - def test_avg(self): + def test_avg(self, engine): with Session(engine) as session: res = session.query(avg(Item.embedding)).first()[0] assert res is None @@ -378,7 +388,7 @@ def test_avg(self): res = session.query(avg(Item.embedding)).first()[0] assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) - def test_avg_orm(self): + def test_avg_orm(self, engine): with Session(engine) as session: res = session.scalars(select(avg(Item.embedding))).first() assert res is None @@ -387,7 +397,7 @@ def test_avg_orm(self): res = session.scalars(select(avg(Item.embedding))).first() assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) - def test_sum(self): + def test_sum(self, engine): with Session(engine) as session: res = session.query(sum(Item.embedding)).first()[0] assert res is None @@ -396,7 +406,7 @@ def test_sum(self): res = session.query(sum(Item.embedding)).first()[0] assert np.array_equal(res, np.array([5, 7, 9])) - def test_sum_orm(self): + def test_sum_orm(self, engine): with Session(engine) as session: res = session.scalars(select(sum(Item.embedding))).first() assert res is None @@ -405,40 +415,40 @@ def test_sum_orm(self): res = 
session.scalars(select(sum(Item.embedding))).first() assert np.array_equal(res, np.array([5, 7, 9])) - def test_bad_dimensions(self): + def test_bad_dimensions(self, engine): item = Item(embedding=[1, 2]) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): session.commit() - def test_bad_ndim(self): + def test_bad_ndim(self, engine): item = Item(embedding=np.array([[1, 2, 3]])) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='expected ndim to be 1'): session.commit() - def test_bad_dtype(self): + def test_bad_dtype(self, engine): item = Item(embedding=np.array(['one', 'two', 'three'])) with Session(engine) as session: session.add(item) with pytest.raises(StatementError, match='could not convert string to float'): session.commit() - def test_inspect(self): + def test_inspect(self, engine): columns = inspect(engine).get_columns('sqlalchemy_orm_item') assert isinstance(columns[1]['type'], VECTOR) - def test_literal_binds(self): + def test_literal_binds(self, engine): sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) - def test_insert(self): + def test_insert(self, engine): with Session(engine) as session: session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) - def test_insert_bulk(self): + def test_insert_bulk(self, engine): with Session(engine) as session: session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) @@ -447,7 +457,7 @@ def test_insert_bulk(self): # with Session(engine) as session: # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) - def test_automap(self): + def test_automap(self, engine): metadata = MetaData() metadata.reflect(engine, only=['sqlalchemy_orm_item']) AutoBase = automap_base(metadata=metadata) @@ -458,7 
+468,7 @@ def test_automap(self): item = session.query(AutoItem).first() assert item.embedding.tolist() == [1, 2, 3] - def test_vector_array(self): + def test_vector_array(self, engine): with Session(array_engine) as session: session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -468,7 +478,7 @@ def test_vector_array(self): assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] - def test_halfvec_array(self): + def test_halfvec_array(self, engine): with Session(array_engine) as session: session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -478,13 +488,13 @@ def test_halfvec_array(self): assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] - def test_half_precision(self): + def test_half_precision(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_binary_quantize(self): + def test_binary_quantize(self, engine): with Session(engine) as session: session.add(Item(id=1, embedding=[-1, -2, -3])) session.add(Item(id=2, embedding=[1, -2, 3])) @@ -495,33 +505,10 @@ def test_binary_quantize(self): items = session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] - def test_psycopg_vector(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] - - def test_psycopg_halfvec(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] - def test_psycopg_bit(self): - with Session(psycopg3_engine) as session: - 
session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' - - def test_psycopg_sparsevec(self): - with Session(psycopg3_engine) as session: - session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] +class TestSqlalchemyAsync: + def setup_method(self): + delete_items() @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') From 5e381602a739ca5307f02c75ee57d219555f5ada Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 11:54:57 -0800 Subject: [PATCH 341/424] Improved array tests --- tests/test_sqlalchemy.py | 49 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index a4ac860..b1f3e85 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -27,10 +27,11 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() -array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +array_engines = [psycopg2_array_engine] -@event.listens_for(array_engine, "connect") +@event.listens_for(psycopg2_array_engine, "connect") def connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) @@ -468,8 +469,31 @@ def test_automap(self, engine): item = session.query(AutoItem).first() assert item.embedding.tolist() == [1, 2, 3] + def test_half_precision(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_binary_quantize(self, 
engine): + with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + items = session.query(Item).order_by(distance).all() + assert [v.id for v in items] == [2, 3, 1] + + +@pytest.mark.parametrize("engine", array_engines) +class TestSqlalchemyArray: + def setup_method(self): + delete_items() + def test_vector_array(self, engine): - with Session(array_engine) as session: + with Session(engine) as session: session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -479,7 +503,7 @@ def test_vector_array(self, engine): assert item.embeddings[1].tolist() == [4, 5, 6] def test_halfvec_array(self, engine): - with Session(array_engine) as session: + with Session(engine) as session: session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) session.commit() @@ -488,23 +512,6 @@ def test_halfvec_array(self, engine): assert item.half_embeddings[0].to_list() == [1, 2, 3] assert item.half_embeddings[1].to_list() == [4, 5, 6] - def test_half_precision(self, engine): - create_items() - with Session(engine) as session: - items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() - assert [v.id for v in items] == [1, 3, 2] - - def test_binary_quantize(self, engine): - with Session(engine) as session: - session.add(Item(id=1, embedding=[-1, -2, -3])) - session.add(Item(id=2, embedding=[1, -2, 3])) - session.add(Item(id=3, embedding=[1, 2, 3])) - session.commit() - - distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) - items = session.query(Item).order_by(distance).all() - assert [v.id for v in items] == [2, 
3, 1] - class TestSqlalchemyAsync: def setup_method(self): From f82e44f231e498c86839735de1658ef7b8cb11a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:01:45 -0800 Subject: [PATCH 342/424] Added tests for SQLAlchemy with pg8000 --- requirements.txt | 1 + tests/test_sqlalchemy.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0e30959..a13be06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ asyncpg Django numpy peewee +pg8000 psycopg[binary,pool] psycopg2-binary pytest diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index b1f3e85..37e803d 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,4 +1,5 @@ import numpy as np +import os from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -16,7 +17,8 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -engines = [psycopg2_engine] +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ['USER']}@localhost/pgvector_python_test') +engines = [psycopg2_engine, pg8000_engine] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @@ -151,9 +153,9 @@ def test_orm(self, engine): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id in [1, 4] - assert items[1].id in [2, 5] - assert items[2].id in [3, 6] + assert items[0].id in [1, 4, 7] + assert items[1].id in [2, 5, 8] + assert items[2].id in [3, 6, 9] assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) @@ -290,12 +292,18 @@ def 
test_bit_hamming_distance_orm(self, engine): assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance(self, engine): + if engine == pg8000_engine: + return + create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() assert [v.id for v in items] == [2, 3, 1] def test_bit_jaccard_distance_orm(self, engine): + if engine == pg8000_engine: + return + create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) From cfcc2ea7b8b942c47c378bf47a4490c5acb50ec7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:02:44 -0800 Subject: [PATCH 343/424] Updated style [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 37e803d..aa2ad97 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -99,7 +99,7 @@ def delete_items(): session.commit() -@pytest.mark.parametrize("engine", engines) +@pytest.mark.parametrize('engine', engines) class TestSqlalchemy: def setup_method(self): delete_items() @@ -495,7 +495,7 @@ def test_binary_quantize(self, engine): assert [v.id for v in items] == [2, 3, 1] -@pytest.mark.parametrize("engine", array_engines) +@pytest.mark.parametrize('engine', array_engines) class TestSqlalchemyArray: def setup_method(self): delete_items() From 95403d5268e11ab6efef969f46f086e3f57e2b52 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:05:34 -0800 Subject: [PATCH 344/424] Added tests for arrays with SQLAlchemy and Psycopg 3 --- tests/test_sqlalchemy.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index aa2ad97..f4a6bce 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -34,11 +34,21 @@ @event.listens_for(psycopg2_array_engine, 
"connect") -def connect(dbapi_connection, connection_record): +def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) +if sqlalchemy_version > 1: + psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + array_engines.append(psycopg_array_engine) + + @event.listens_for(psycopg_array_engine, "connect") + def psycopg_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector + register_vector(dbapi_connection) + + Base = declarative_base() From c74e090f26a02fc920ef910265ac0e4f2eb7cbde Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:11:46 -0800 Subject: [PATCH 345/424] Fixed CI --- tests/test_sqlalchemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index f4a6bce..7dbc565 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -17,7 +17,7 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ['USER']}@localhost/pgvector_python_test') +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') engines = [psycopg2_engine, pg8000_engine] if sqlalchemy_version > 1: From b350d6a8d45d02ea954fad945d194896c50fbc1e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:13:46 -0800 Subject: [PATCH 346/424] Simplified test code [skip ci] --- tests/test_django.py | 2 +- tests/test_peewee.py | 2 +- tests/test_psycopg.py | 2 +- tests/test_psycopg2.py | 2 +- tests/test_sqlmodel.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 2c53d82..ea15771 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -158,7 +158,7 @@ class Meta: class 
TestDjango: - def setup_method(self, test_method): + def setup_method(self): Item.objects.all().delete() def test_vector(self): diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 9666388..e98a0ec 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -36,7 +36,7 @@ def create_items(): class TestPeewee: - def setup_method(self, test_method): + def setup_method(self): Item.truncate_table() def test_vector(self): diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 6d4f34a..90f80b6 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -14,7 +14,7 @@ class TestPsycopg: - def setup_method(self, test_method): + def setup_method(self): conn.execute('DELETE FROM psycopg_items') def test_vector(self): diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index c3cd3cd..d661f12 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -16,7 +16,7 @@ class TestPsycopg2: - def setup_method(self, test_method): + def setup_method(self): cur.execute('DELETE FROM psycopg2_items') def test_vector(self): diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 851afd8..e0330d2 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -42,7 +42,7 @@ def create_items(): class TestSqlmodel: - def setup_method(self, test_method): + def setup_method(self): with Session(engine) as session: session.exec(delete(Item)) session.commit() From 651df0844b3c6790414ec2e8ed75330ad80406af Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:15:52 -0800 Subject: [PATCH 347/424] Improved SQLModel tests [skip ci] --- tests/test_sqlmodel.py | 68 +++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index e0330d2..373834f 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -34,11 +34,11 @@ class Item(SQLModel, table=True): def create_items(): - session = Session(engine) - session.add(Item(id=1, 
embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) - session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) - session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) - session.commit() + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) + session.commit() class TestSqlmodel: @@ -52,11 +52,11 @@ def test_orm(self): item2 = Item(embedding=[4, 5, 6]) item3 = Item() - session = Session(engine) - session.add(item) - session.add(item2) - session.add(item3) - session.commit() + with Session(engine) as session: + session.add(item) + session.add(item2) + session.add(item3) + session.commit() stmt = select(Item) with Session(engine) as session: @@ -71,11 +71,11 @@ def test_orm(self): assert items[2].embedding is None def test_vector(self): - session = Session(engine) - session.add(Item(id=1, embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.embedding.tolist() == [1, 2, 3] def test_vector_l2_distance(self): create_items() @@ -102,11 +102,11 @@ def test_vector_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_halfvec(self): - session = Session(engine) - session.add(Item(id=1, half_embedding=[1, 2, 3])) - session.commit() - item = 
session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding.to_list() == [1, 2, 3] def test_halfvec_l2_distance(self): create_items() @@ -133,11 +133,11 @@ def test_halfvec_l1_distance(self): assert [v.id for v in items] == [1, 3, 2] def test_bit(self): - session = Session(engine) - session.add(Item(id=1, binary_embedding='101')) - session.commit() - item = session.get(Item, 1) - assert item.binary_embedding == '101' + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' def test_bit_hamming_distance(self): create_items() @@ -152,11 +152,11 @@ def test_bit_jaccard_distance(self): assert [v.id for v in items] == [2, 3, 1] def test_sparsevec(self): - session = Session(engine) - session.add(Item(id=1, sparse_embedding=[1, 2, 3])) - session.commit() - item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding.to_list() == [1, 2, 3] def test_sparsevec_l2_distance(self): create_items() @@ -232,7 +232,7 @@ def test_halfvec_sum(self): def test_bad_dimensions(self): item = Item(embedding=[1, 2]) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() From 2883156b461f08fe32be81439d8e653ac1c41c5a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:42:19 -0800 Subject: [PATCH 348/424] Improved tests for async SQLAlchemy engines [skip ci] --- 
tests/test_sqlalchemy.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 7dbc565..6e1d496 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,11 +19,18 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') engines = [psycopg2_engine, pg8000_engine] +async_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') engines.append(psycopg_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async_engines.append(psycopg_async_engine) + + asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async_engines.append(asyncpg_engine) + setup_engine = engines[0] with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) @@ -531,14 +538,13 @@ def test_halfvec_array(self, engine): assert item.half_embeddings[1].to_list() == [4, 5, 6] +@pytest.mark.parametrize('engine', async_engines) class TestSqlalchemyAsync: def setup_method(self): delete_items() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_avg(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def test_psycopg_async_avg(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -550,6 +556,11 @@ async def test_psycopg_async_avg(self): await engine.dispose() + +class TestSqlalchemyAsync2: + def setup_method(self): + delete_items() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def 
test_psycopg_async_vector_array(self): From 86331f0ee6650adcdb655b5d092f1c24d3b0fa84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:45:22 -0800 Subject: [PATCH 349/424] Improved tests for async SQLAlchemy engines [skip ci] --- tests/test_sqlalchemy.py | 104 +++++++++++++++------------------------ 1 file changed, 39 insertions(+), 65 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 6e1d496..689615b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -544,122 +544,96 @@ def setup_method(self): delete_items() @pytest.mark.asyncio - async def test_psycopg_async_avg(self, engine): + async def test_vector(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - session.add(Item(embedding=[1, 2, 3])) - session.add(Item(embedding=[4, 5, 6])) - avg = await session.scalars(select(func.avg(Item.embedding))) - assert avg.first() == '[2.5,3.5,4.5]' + embedding = np.array([1, 2, 3]) + session.add(Item(id=1, embedding=embedding)) + item = await session.get(Item, 1) + assert np.array_equal(item.embedding, embedding) await engine.dispose() - -class TestSqlalchemyAsync2: - def setup_method(self): - delete_items() - @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_vector_array(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def test_halfvec(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - @event.listens_for(engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector_async - dbapi_connection.run_async(register_vector_async) - async with async_session() as session: async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this 
fails if the driver does not cast arrays + embedding = [1, 2, 3] + session.add(Item(id=1, half_embedding=embedding)) item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert item.half_embedding.to_list() == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_vector(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_session = async_sessionmaker(engine, expire_on_commit=False) + async def test_bit(self, engine): + import asyncpg - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) + async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - embedding = np.array([1, 2, 3]) - session.add(Item(id=1, embedding=embedding)) + embedding = asyncpg.BitString('101') if engine == asyncpg_engine else '101' + session.add(Item(id=1, binary_embedding=embedding)) item = await session.get(Item, 1) - assert np.array_equal(item.embedding, embedding) + assert item.binary_embedding == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_halfvec(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_sparsevec(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # 
dbapi_connection.run_async(register_vector) - async with async_session() as session: async with session.begin(): embedding = [1, 2, 3] - session.add(Item(id=1, half_embedding=embedding)) + session.add(Item(id=1, sparse_embedding=embedding)) item = await session.get(Item, 1) - assert item.half_embedding.to_list() == embedding + assert item.sparse_embedding.to_list() == embedding await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_bit(self): - import asyncpg - - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_avg(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: async with session.begin(): - embedding = asyncpg.BitString('101') - session.add(Item(id=1, binary_embedding=embedding)) - item = await session.get(Item, 1) - assert item.binary_embedding == embedding + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + avg = await session.scalars(select(func.avg(Item.embedding))) + assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() + +class TestSqlalchemyAsync2: + def setup_method(self): + delete_items() + @pytest.mark.asyncio @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_asyncpg_sparsevec(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + async def test_psycopg_async_vector_array(self): + engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) - # TODO do not throw error when types are registered - # @event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) + 
@event.listens_for(engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) async with async_session() as session: async with session.begin(): - embedding = [1, 2, 3] - session.add(Item(id=1, sparse_embedding=embedding)) + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + + # this fails if the driver does not cast arrays item = await session.get(Item, 1) - assert item.sparse_embedding.to_list() == embedding + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] await engine.dispose() From 224c18a47c2a2d652fe1f7267449e61c290b249f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 12:49:14 -0800 Subject: [PATCH 350/424] Simplified test code [skip ci] --- tests/test_sqlalchemy.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 689615b..07e29e7 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,3 +1,4 @@ +import asyncpg import numpy as np import os from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum @@ -571,8 +572,6 @@ async def test_halfvec(self, engine): @pytest.mark.asyncio async def test_bit(self, engine): - import asyncpg - async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -611,13 +610,13 @@ async def test_avg(self, engine): await engine.dispose() -class TestSqlalchemyAsync2: +@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') +class TestSqlalchemyAsyncArray: def setup_method(self): delete_items() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') - async def test_psycopg_async_vector_array(self): + async def test_psycopg_vector_array(self): engine = 
create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) @@ -638,7 +637,6 @@ def connect(dbapi_connection, connection_record): await engine.dispose() @pytest.mark.asyncio - @pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') async def test_asyncpg_vector_array(self): engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_session = async_sessionmaker(engine, expire_on_commit=False) From a1d89971cd10ebe0dc11969ec532011eaa8a9a78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:02:05 -0800 Subject: [PATCH 351/424] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 07e29e7..8868df4 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,7 +19,17 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') +psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') + + +@event.listens_for(psycopg2_array_engine, "connect") +def psycopg2_connect(dbapi_connection, connection_record): + from pgvector.psycopg2 import register_vector + register_vector(dbapi_connection, globally=False, arrays=True) + + engines = [psycopg2_engine, pg8000_engine] +array_engines = [psycopg2_array_engine] async_engines = [] if sqlalchemy_version > 1: @@ -32,22 +42,6 @@ asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) -setup_engine = engines[0] -with Session(setup_engine) as session: - session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) - session.commit() - -psycopg2_array_engine = 
create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -array_engines = [psycopg2_array_engine] - - -@event.listens_for(psycopg2_array_engine, "connect") -def psycopg2_connect(dbapi_connection, connection_record): - from pgvector.psycopg2 import register_vector - register_vector(dbapi_connection, globally=False, arrays=True) - - -if sqlalchemy_version > 1: psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') array_engines.append(psycopg_array_engine) @@ -56,6 +50,10 @@ def psycopg_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector register_vector(dbapi_connection) +setup_engine = engines[0] +with Session(setup_engine) as session: + session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) + session.commit() Base = declarative_base() From c792451a76fea51352ad0a5f952c97eaeaea70d7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:12:57 -0800 Subject: [PATCH 352/424] Test more engine configurations --- tests/test_sqlalchemy.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 8868df4..dffa07b 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -19,37 +19,39 @@ psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') -psycopg2_array_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +psycopg2_type_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -@event.listens_for(psycopg2_array_engine, "connect") +@event.listens_for(psycopg2_type_engine, "connect") def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector register_vector(dbapi_connection, globally=False, arrays=True) -engines = 
[psycopg2_engine, pg8000_engine] -array_engines = [psycopg2_array_engine] +engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] +array_engines = [psycopg2_type_engine] async_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') engines.append(psycopg_engine) + psycopg_type_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_type_engine, "connect") + def psycopg_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector + register_vector(dbapi_connection) + + engines.append(psycopg_type_engine) + array_engines.append(psycopg_type_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_engines.append(psycopg_async_engine) asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) - psycopg_array_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - array_engines.append(psycopg_array_engine) - - @event.listens_for(psycopg_array_engine, "connect") - def psycopg_connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector - register_vector(dbapi_connection) - setup_engine = engines[0] with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) @@ -169,9 +171,10 @@ def test_orm(self, engine): stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id in [1, 4, 7] - assert items[1].id in [2, 5, 8] - assert items[2].id in [3, 6, 9] + # TODO improve + assert items[0].id % 3 == 1 + assert items[1].id % 3 == 2 + assert items[2].id % 3 == 0 assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) From 
88873e54365ca6086a1c960e2ced19ee98ea2bb2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:21:36 -0800 Subject: [PATCH 353/424] Improved tests for async SQLAlchemy engines [skip ci] --- tests/test_sqlalchemy.py | 58 +++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index dffa07b..a245ffc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -31,6 +31,7 @@ def psycopg2_connect(dbapi_connection, connection_record): engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] array_engines = [psycopg2_type_engine] async_engines = [] +async_array_engines = [] if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @@ -46,11 +47,32 @@ def psycopg_connect(dbapi_connection, connection_record): engines.append(psycopg_type_engine) array_engines.append(psycopg_type_engine) + psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) + + async_engines.append(psycopg_async_type_engine) + async_array_engines.append(psycopg_async_type_engine) + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') async_engines.append(psycopg_async_engine) asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') async_engines.append(asyncpg_engine) + async_array_engines.append(asyncpg_engine) + + asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + + @event.listens_for(asyncpg_type_engine.sync_engine, "connect") + def connect(dbapi_connection, connection_record): + from pgvector.asyncpg import 
register_vector + dbapi_connection.run_async(register_vector) + + # TODO do not throw error when types are registered + # async_array_engines.append(asyncpg_type_engine) setup_engine = engines[0] with Session(setup_engine) as session: @@ -599,6 +621,10 @@ async def test_sparsevec(self, engine): @pytest.mark.asyncio async def test_avg(self, engine): + # TODO do not throw error when types are registered + if engine == psycopg_async_type_engine: + return + async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -611,43 +637,15 @@ async def test_avg(self, engine): await engine.dispose() -@pytest.mark.skipif(sqlalchemy_version == 1, reason='Requires SQLAlchemy 2+') +@pytest.mark.parametrize('engine', async_array_engines) class TestSqlalchemyAsyncArray: def setup_method(self): delete_items() @pytest.mark.asyncio - async def test_psycopg_vector_array(self): - engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + async def test_vector_array(self, engine): async_session = async_sessionmaker(engine, expire_on_commit=False) - @event.listens_for(engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): - from pgvector.psycopg import register_vector_async - dbapi_connection.run_async(register_vector_async) - - async with async_session() as session: - async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) - - # this fails if the driver does not cast arrays - item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] - - await engine.dispose() - - @pytest.mark.asyncio - async def test_asyncpg_vector_array(self): - engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_session = async_sessionmaker(engine, expire_on_commit=False) - - # TODO do not throw error when types are registered - # 
@event.listens_for(engine.sync_engine, "connect") - # def connect(dbapi_connection, connection_record): - # from pgvector.asyncpg import register_vector - # dbapi_connection.run_async(register_vector) - async with async_session() as session: async with session.begin(): session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) From e78a8d5f866c2577644a64eafcaf9939c4b9ab8c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:28:23 -0800 Subject: [PATCH 354/424] Improved test code [skip ci] --- tests/test_sqlalchemy.py | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index a245ffc..4b26922 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -18,7 +18,6 @@ sqlalchemy_version = 1 psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') psycopg2_type_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') @@ -28,15 +27,10 @@ def psycopg2_connect(dbapi_connection, connection_record): register_vector(dbapi_connection, globally=False, arrays=True) -engines = [psycopg2_engine, pg8000_engine, psycopg2_type_engine] -array_engines = [psycopg2_type_engine] -async_engines = [] -async_array_engines = [] +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') - engines.append(psycopg_engine) - psycopg_type_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_type_engine, "connect") @@ -44,9 +38,7 @@ def psycopg_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector register_vector(dbapi_connection) - 
engines.append(psycopg_type_engine) - array_engines.append(psycopg_type_engine) - + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") @@ -54,16 +46,7 @@ def connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector_async dbapi_connection.run_async(register_vector_async) - async_engines.append(psycopg_async_type_engine) - async_array_engines.append(psycopg_async_type_engine) - - psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') - async_engines.append(psycopg_async_engine) - asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') - async_engines.append(asyncpg_engine) - async_array_engines.append(asyncpg_engine) - asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') @event.listens_for(asyncpg_type_engine.sync_engine, "connect") @@ -71,8 +54,17 @@ def connect(dbapi_connection, connection_record): from pgvector.asyncpg import register_vector dbapi_connection.run_async(register_vector) - # TODO do not throw error when types are registered - # async_array_engines.append(asyncpg_type_engine) +engines = [psycopg2_engine, psycopg2_type_engine, pg8000_engine] +array_engines = [psycopg2_type_engine] +async_engines = [] +async_array_engines = [] + +if sqlalchemy_version > 1: + engines += [psycopg_engine, psycopg_type_engine] + array_engines += [psycopg_type_engine] + async_engines += [psycopg_async_engine, psycopg_async_type_engine, asyncpg_engine] + # TODO add asyncpg_type_engine + async_array_engines += [psycopg_async_type_engine, asyncpg_engine] setup_engine = engines[0] with Session(setup_engine) as session: From a2699639d7fd468ea68442d72227d5099ad8a64b Mon Sep 17 00:00:00 2001 From: Andrew Kane 
Date: Sun, 9 Feb 2025 13:33:09 -0800 Subject: [PATCH 355/424] Updated todo [skip ci] --- tests/test_sqlalchemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 4b26922..09df9b9 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -62,8 +62,8 @@ def connect(dbapi_connection, connection_record): if sqlalchemy_version > 1: engines += [psycopg_engine, psycopg_type_engine] array_engines += [psycopg_type_engine] + # TODO support asyncpg_type_engine async_engines += [psycopg_async_engine, psycopg_async_type_engine, asyncpg_engine] - # TODO add asyncpg_type_engine async_array_engines += [psycopg_async_type_engine, asyncpg_engine] setup_engine = engines[0] From 7cd310b5cf986fab1da536a94e9f9c74379e46b9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 13:36:12 -0800 Subject: [PATCH 356/424] Improved test [skip ci] --- tests/test_sqlalchemy.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 09df9b9..563e3a3 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -613,10 +613,6 @@ async def test_sparsevec(self, engine): @pytest.mark.asyncio async def test_avg(self, engine): - # TODO do not throw error when types are registered - if engine == psycopg_async_type_engine: - return - async_session = async_sessionmaker(engine, expire_on_commit=False) async with async_session() as session: @@ -624,7 +620,10 @@ async def test_avg(self, engine): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) avg = await session.scalars(select(func.avg(Item.embedding))) - assert avg.first() == '[2.5,3.5,4.5]' + if engine == psycopg_async_type_engine: + assert avg.first().tolist() == [2.5, 3.5, 4.5] + else: + assert avg.first() == '[2.5,3.5,4.5]' await engine.dispose() From cae30a1d1b0b23620abb9c9ff4c7084ca5bac1ee Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: 
Sun, 9 Feb 2025 13:37:07 -0800 Subject: [PATCH 357/424] Improved test [skip ci] --- tests/test_sqlalchemy.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 563e3a3..aa5d81f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -619,11 +619,8 @@ async def test_avg(self, engine): async with session.begin(): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = await session.scalars(select(func.avg(Item.embedding))) - if engine == psycopg_async_type_engine: - assert avg.first().tolist() == [2.5, 3.5, 4.5] - else: - assert avg.first() == '[2.5,3.5,4.5]' + res = await session.scalars(select(avg(Item.embedding))) + assert res.first().tolist() == [2.5, 3.5, 4.5] await engine.dispose() From 3de7832d164b82e929e08d928501b081c93e3a5a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:27:35 -0800 Subject: [PATCH 358/424] Dropped support for Python < 3.9 --- .github/workflows/build.yml | 2 +- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 562ba94..dc53dfe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.13, 3.8] + python: [3.13, 3.9] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a517d8..1788ff4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.0 (unreleased) + +- Dropped support for Python < 3.9 + ## 0.3.6 (2024-10-26) - Added `arrays` option for Psycopg 2 diff --git a/pyproject.toml b/pyproject.toml index a6a6609..0f291f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Andrew Kane", email = "andrew@ankane.org"} ] license = {text = "MIT"} -requires-python = ">= 3.8" +requires-python = ">= 
3.9" dependencies = [ "numpy" ] From 37b148f459863ee6f6c448fb93eecef569e7eb40 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:29:04 -0800 Subject: [PATCH 359/424] Removed default value [skip ci] --- pgvector/psycopg2/register.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 08a69a9..2be292f 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,11 +5,10 @@ from .vector import register_vector_info -# TODO remove default value for conn_or_curs in 0.4.0 # TODO make globally False by default in 0.4.0 # note: register_adapter is always global # TODO make arrays True by defalt in 0.4.0 -def register_vector(conn_or_curs=None, globally=True, arrays=False): +def register_vector(conn_or_curs, globally=True, arrays=False): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) scope = None if globally else conn_or_curs From 8a621a3ae96a85320475180b2120cb6d92c095a4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:33:27 -0800 Subject: [PATCH 360/424] Changed default values of globally and arrays for register_type with Psycopg 2 [skip ci] --- pgvector/psycopg2/register.py | 4 +--- tests/test_psycopg2.py | 8 ++++---- tests/test_sqlalchemy.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py index 2be292f..1bc9d44 100644 --- a/pgvector/psycopg2/register.py +++ b/pgvector/psycopg2/register.py @@ -5,10 +5,8 @@ from .vector import register_vector_info -# TODO make globally False by default in 0.4.0 # note: register_adapter is always global -# TODO make arrays True by defalt in 0.4.0 -def register_vector(conn_or_curs, globally=True, arrays=False): +def register_vector(conn_or_curs, globally=False, arrays=True): conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else 
conn_or_curs.connection cur = conn.cursor(cursor_factory=cursor) scope = None if globally else conn_or_curs diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index d661f12..85aa0e8 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -12,7 +12,7 @@ cur.execute('DROP TABLE IF EXISTS psycopg2_items') cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') -register_vector(cur, globally=False, arrays=True) +register_vector(cur) class TestPsycopg2: @@ -87,13 +87,13 @@ def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test') cur = conn.cursor(cursor_factory=cursor_factory) - register_vector(cur, globally=False) + register_vector(cur) conn.close() def test_cursor_factory_connection(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) - register_vector(conn, globally=False) + register_vector(conn) conn.close() def test_pool(self): @@ -102,7 +102,7 @@ def test_pool(self): conn = pool.getconn() try: # use globally=True for apps to ensure registered with all connections - register_vector(conn, globally=False) + register_vector(conn) finally: pool.putconn(conn) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index aa5d81f..067a153 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -24,7 +24,7 @@ @event.listens_for(psycopg2_type_engine, "connect") def psycopg2_connect(dbapi_connection, connection_record): from pgvector.psycopg2 import register_vector - register_vector(dbapi_connection, globally=False, arrays=True) + register_vector(dbapi_connection) pg8000_engine = 
create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') From 32b09c0272545322b90d38139ee625a9a7809a71 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:36:23 -0800 Subject: [PATCH 361/424] Fixed indices and values methods returning tuple instead of list in certain cases [skip ci] --- pgvector/utils/sparsevec.py | 3 +-- tests/test_psycopg.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index a370c5e..0398106 100644 --- a/pgvector/utils/sparsevec.py +++ b/pgvector/utils/sparsevec.py @@ -108,8 +108,7 @@ def from_binary(cls, value): dim, nnz, unused = unpack_from('>iii', value) indices = unpack_from(f'>{nnz}i', value, 12) values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) - # TODO convert indices and values to lists in 0.4.0 - return cls._from_parts(int(dim), indices, values) + return cls._from_parts(int(dim), list(indices), list(values)) @classmethod def _from_parts(cls, dim, indices, values): diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 90f80b6..cf5f09a 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -111,9 +111,8 @@ def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res.dimensions() == 6 - # TODO convert indices and values to lists in 0.4.0 - assert res.indices() == (0, 2, 4) - assert res.values() == (1.5, 2, 3) + assert res.indices() == [0, 2, 4] + assert res.values() == [1.5, 2, 3] assert res.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From c10799c3dec3dea699fc4590d3c3baa688023b23 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:38:50 -0800 Subject: [PATCH 362/424] Added support for Vector class to Psycopg 2 [skip ci] --- pgvector/psycopg2/__init__.py | 3 ++- pgvector/psycopg2/vector.py 
| 1 + tests/test_psycopg2.py | 12 +++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 7c95295..f109203 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,8 +1,9 @@ from .register import register_vector -from ..utils import HalfVector, SparseVector +from ..utils import HalfVector, SparseVector, Vector __all__ = [ 'register_vector', + 'Vector', 'HalfVector', 'SparseVector' ] diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 9861f01..5bd00bb 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -24,3 +24,4 @@ def register_vector_info(oid, array_oid, scope): register_type(vectorarray, scope) register_adapter(np.ndarray, VectorAdapter) + register_adapter(Vector, VectorAdapter) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 85aa0e8..f927d86 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.psycopg2 import register_vector, HalfVector, SparseVector +from pgvector.psycopg2 import register_vector, Vector, HalfVector, SparseVector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor from psycopg2.pool import ThreadedConnectionPool @@ -29,6 +29,16 @@ def test_vector(self): assert res[0][0].dtype == np.float32 assert res[1][0] is None + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + def test_halfvec(self): embedding = [1.5, 2, 3] cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) From e138e55d187c1b03f9b827c6849aabb1f9697eaf Mon 
Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:50:52 -0800 Subject: [PATCH 363/424] Fixed equality for types [skip ci] --- pgvector/django/functions.py | 4 ++++ pgvector/utils/bit.py | 5 +++++ pgvector/utils/halfvec.py | 5 +++++ pgvector/utils/sparsevec.py | 5 +++++ pgvector/utils/vector.py | 5 +++++ tests/test_bit.py | 4 ++++ tests/test_half_vector.py | 4 ++++ tests/test_sparse_vector.py | 5 +++++ tests/test_vector.py | 4 ++++ 9 files changed, 41 insertions(+) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index da9fbf8..6c14c3d 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -13,6 +13,10 @@ def __init__(self, expression, vector, **extra): vector = Value(SparseVector._to_db(vector)) else: vector = Value(Vector._to_db(vector)) + + # prevent error with unhashable types + self._constructor_args = ((expression, vector), extra) + super().__init__(expression, vector, **extra) diff --git a/pgvector/utils/bit.py b/pgvector/utils/bit.py index 51f7556..227edc1 100644 --- a/pgvector/utils/bit.py +++ b/pgvector/utils/bit.py @@ -24,6 +24,11 @@ def __init__(self, value): def __repr__(self): return f'Bit({self.to_text()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def to_list(self): return self._value.tolist() diff --git a/pgvector/utils/halfvec.py b/pgvector/utils/halfvec.py index e1e5051..f335f2f 100644 --- a/pgvector/utils/halfvec.py +++ b/pgvector/utils/halfvec.py @@ -16,6 +16,11 @@ def __init__(self, value): def __repr__(self): return f'HalfVector({self.to_list()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def dimensions(self): return len(self._value) diff --git a/pgvector/utils/sparsevec.py b/pgvector/utils/sparsevec.py index 0398106..8df2dfd 100644 --- a/pgvector/utils/sparsevec.py +++ 
b/pgvector/utils/sparsevec.py @@ -26,6 +26,11 @@ def __repr__(self): elements = dict(zip(self._indices, self._values)) return f'SparseVector({elements}, {self._dim})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.dimensions() == other.dimensions() and self.indices() == other.indices() and self.values() == other.values() + return False + def dimensions(self): return self._dim diff --git a/pgvector/utils/vector.py b/pgvector/utils/vector.py index 3fa2f35..ebbcafd 100644 --- a/pgvector/utils/vector.py +++ b/pgvector/utils/vector.py @@ -16,6 +16,11 @@ def __init__(self, value): def __repr__(self): return f'Vector({self.to_list()})' + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + def dimensions(self): return len(self._value) diff --git a/tests/test_bit.py b/tests/test_bit.py index 32ab87b..a7e0093 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -37,3 +37,7 @@ def test_ndim_zero(self): def test_repr(self): assert repr(Bit([True, False, True])) == 'Bit(101)' assert str(Bit([True, False, True])) == 'Bit(101)' + + def test_equality(self): + assert Bit([True, False, True]) == Bit([True, False, True]) + assert Bit([True, False, True]) != Bit([True, False, False]) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index fdaa5f7..77a7869 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -38,5 +38,9 @@ def test_repr(self): assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + def test_equality(self): + assert HalfVector([1, 2, 3]) == HalfVector([1, 2, 3]) + assert HalfVector([1, 2, 3]) != HalfVector([1, 2, 4]) + def test_dimensions(self): assert HalfVector([1, 2, 3]).dimensions() == 3 diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 06fe81a..24d8c20 100644 --- a/tests/test_sparse_vector.py +++ 
b/tests/test_sparse_vector.py @@ -52,6 +52,11 @@ def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' + def test_equality(self): + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector([1, 0, 2, 0, 3, 0]) + assert SparseVector([1, 0, 2, 0, 3, 0]) != SparseVector([1, 0, 2, 0, 3, 1]) + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + def test_dimensions(self): assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 diff --git a/tests/test_vector.py b/tests/test_vector.py index 1be2bc0..fe14dea 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -38,5 +38,9 @@ def test_repr(self): assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + def test_equality(self): + assert Vector([1, 2, 3]) == Vector([1, 2, 3]) + assert Vector([1, 2, 3]) != Vector([1, 2, 4]) + def test_dimensions(self): assert Vector([1, 2, 3]).dimensions() == 3 From 838ea0c73b1669c94de274eccd82f58d83ea55b4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 20:57:03 -0800 Subject: [PATCH 364/424] Moved classes to pgvector module [skip ci] --- pgvector/__init__.py | 11 +++++++++++ pgvector/{utils => }/bit.py | 0 pgvector/{utils => }/halfvec.py | 0 pgvector/{utils => }/sparsevec.py | 0 pgvector/utils/__init__.py | 5 +---- pgvector/{utils => }/vector.py | 0 6 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 pgvector/__init__.py rename pgvector/{utils => }/bit.py (100%) rename pgvector/{utils => }/halfvec.py (100%) rename pgvector/{utils => }/sparsevec.py (100%) rename pgvector/{utils => }/vector.py (100%) diff --git a/pgvector/__init__.py b/pgvector/__init__.py new file mode 100644 index 0000000..3c01160 --- /dev/null +++ b/pgvector/__init__.py @@ -0,0 +1,11 @@ +from .bit import Bit +from .halfvec import 
HalfVector +from .sparsevec import SparseVector +from .vector import Vector + +__all__ = [ + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] diff --git a/pgvector/utils/bit.py b/pgvector/bit.py similarity index 100% rename from pgvector/utils/bit.py rename to pgvector/bit.py diff --git a/pgvector/utils/halfvec.py b/pgvector/halfvec.py similarity index 100% rename from pgvector/utils/halfvec.py rename to pgvector/halfvec.py diff --git a/pgvector/utils/sparsevec.py b/pgvector/sparsevec.py similarity index 100% rename from pgvector/utils/sparsevec.py rename to pgvector/sparsevec.py diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 3c01160..1dcc240 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,7 +1,4 @@ -from .bit import Bit -from .halfvec import HalfVector -from .sparsevec import SparseVector -from .vector import Vector +from .. import Bit, HalfVector, SparseVector, Vector __all__ = [ 'Vector', diff --git a/pgvector/utils/vector.py b/pgvector/vector.py similarity index 100% rename from pgvector/utils/vector.py rename to pgvector/vector.py From 0ac00b4e3d39ea1ddefd8573588f7de2e60d112f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:00:08 -0800 Subject: [PATCH 365/424] Improved imports for tests [skip ci] --- tests/test_asyncpg.py | 3 ++- tests/test_bit.py | 2 +- tests/test_django.py | 3 ++- tests/test_half_vector.py | 2 +- tests/test_peewee.py | 3 ++- tests/test_psycopg.py | 3 ++- tests/test_psycopg2.py | 3 ++- tests/test_sparse_vector.py | 2 +- tests/test_sqlalchemy.py | 3 ++- tests/test_sqlmodel.py | 3 ++- tests/test_vector.py | 2 +- 11 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 48d1e32..3c36048 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,6 +1,7 @@ import asyncpg import numpy as np -from pgvector.asyncpg import register_vector, SparseVector +from pgvector import SparseVector +from 
pgvector.asyncpg import register_vector import pytest diff --git a/tests/test_bit.py b/tests/test_bit.py index a7e0093..e0dcfe6 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import Bit +from pgvector import Bit import pytest diff --git a/tests/test_django.py b/tests/test_django.py index ea15771..65082a3 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -12,7 +12,8 @@ import numpy as np import os import pgvector.django -from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance, HalfVector, SparseVector +from pgvector import HalfVector, SparseVector +from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance from unittest import mock settings.configure( diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 77a7869..6a94c2e 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import HalfVector +from pgvector import HalfVector import pytest diff --git a/tests/test_peewee.py b/tests/test_peewee.py index e98a0ec..670d880 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,8 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField, SparseVector +from pgvector import SparseVector +from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField db = PostgresqlDatabase('pgvector_python_test') diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index cf5f09a..6a9d0b7 100644 --- a/tests/test_psycopg.py +++ 
b/tests/test_psycopg.py @@ -1,5 +1,6 @@ import numpy as np -from pgvector.psycopg import register_vector, register_vector_async, Bit, HalfVector, SparseVector, Vector +from pgvector import Bit, HalfVector, SparseVector, Vector +from pgvector.psycopg import register_vector, register_vector_async import psycopg from psycopg_pool import ConnectionPool, AsyncConnectionPool import pytest diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index f927d86..1994c87 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,5 +1,6 @@ import numpy as np -from pgvector.psycopg2 import register_vector, Vector, HalfVector, SparseVector +from pgvector import HalfVector, SparseVector, Vector +from pgvector.psycopg2 import register_vector import psycopg2 from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor from psycopg2.pool import ThreadedConnectionPool diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 24d8c20..b5e7fe8 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import SparseVector +from pgvector import SparseVector import pytest from scipy.sparse import coo_array diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 067a153..052edd7 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,8 @@ import asyncpg import numpy as np import os -from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum +from pgvector import SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 373834f..b0e8ccd 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,6 @@ import numpy as np 
-from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, SparseVector, avg, sum +from pgvector import SparseVector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy.exc import StatementError from sqlmodel import Field, Index, Session, SQLModel, create_engine, delete, select, text diff --git a/tests/test_vector.py b/tests/test_vector.py index fe14dea..406637f 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector.utils import Vector +from pgvector import Vector import pytest From 435e31654831d303342a1100a8dd32b6c1fe42a6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:03:28 -0800 Subject: [PATCH 366/424] Improved imports --- pgvector/asyncpg/__init__.py | 4 +++- pgvector/asyncpg/register.py | 2 +- pgvector/django/__init__.py | 4 +++- pgvector/django/functions.py | 2 +- pgvector/django/halfvec.py | 2 +- pgvector/django/sparsevec.py | 2 +- pgvector/django/vector.py | 2 +- pgvector/peewee/__init__.py | 4 +++- pgvector/peewee/halfvec.py | 2 +- pgvector/peewee/sparsevec.py | 2 +- pgvector/peewee/vector.py | 2 +- pgvector/psycopg/__init__.py | 4 +++- pgvector/psycopg/bit.py | 2 +- pgvector/psycopg/halfvec.py | 2 +- pgvector/psycopg/sparsevec.py | 2 +- pgvector/psycopg/vector.py | 2 +- pgvector/psycopg2/__init__.py | 4 +++- pgvector/psycopg2/halfvec.py | 2 +- pgvector/psycopg2/sparsevec.py | 2 +- pgvector/psycopg2/vector.py | 2 +- pgvector/sqlalchemy/__init__.py | 4 +++- pgvector/sqlalchemy/halfvec.py | 2 +- pgvector/sqlalchemy/sparsevec.py | 2 +- pgvector/sqlalchemy/vector.py | 2 +- 24 files changed, 36 insertions(+), 24 deletions(-) diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 543b882..c6a3b4e 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector -from ..utils import Vector, HalfVector, SparseVector + +# TODO remove +from .. 
import Vector, HalfVector, SparseVector __all__ = [ 'register_vector', diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py index a388058..63726f3 100644 --- a/pgvector/asyncpg/register.py +++ b/pgvector/asyncpg/register.py @@ -1,4 +1,4 @@ -from ..utils import Vector, HalfVector, SparseVector +from .. import Vector, HalfVector, SparseVector async def register_vector(conn, schema='public'): diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py index 09978a9..43c64a3 100644 --- a/pgvector/django/__init__.py +++ b/pgvector/django/__init__.py @@ -5,7 +5,9 @@ from .indexes import IvfflatIndex, HnswIndex from .sparsevec import SparseVectorField from .vector import VectorField -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. import HalfVector, SparseVector __all__ = [ 'VectorExtension', diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py index 6c14c3d..9df4fdb 100644 --- a/pgvector/django/functions.py +++ b/pgvector/django/functions.py @@ -1,5 +1,5 @@ from django.db.models import FloatField, Func, Value -from ..utils import Vector, HalfVector, SparseVector +from .. import Vector, HalfVector, SparseVector class DistanceBase(Func): diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py index 6b59a7f..3aeb90f 100644 --- a/pgvector/django/halfvec.py +++ b/pgvector/django/halfvec.py @@ -1,6 +1,6 @@ from django import forms from django.db.models import Field -from ..utils import HalfVector +from .. import HalfVector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py index d0d2d07..580f27c 100644 --- a/pgvector/django/sparsevec.py +++ b/pgvector/django/sparsevec.py @@ -1,6 +1,6 @@ from django import forms from django.db.models import Field -from ..utils import SparseVector +from .. 
import SparseVector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py index a89d540..861cfde 100644 --- a/pgvector/django/vector.py +++ b/pgvector/django/vector.py @@ -1,7 +1,7 @@ from django import forms from django.db.models import Field import numpy as np -from ..utils import Vector +from .. import Vector # https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 945e0dc..df21200 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -2,7 +2,9 @@ from .halfvec import HalfVectorField from .sparsevec import SparseVectorField from .vector import VectorField -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. import HalfVector, SparseVector __all__ = [ 'VectorField', diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py index deaa14d..0901fd2 100644 --- a/pgvector/peewee/halfvec.py +++ b/pgvector/peewee/halfvec.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import HalfVector +from .. import HalfVector class HalfVectorField(Field): diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py index 67f7d1b..86dea73 100644 --- a/pgvector/peewee/sparsevec.py +++ b/pgvector/peewee/sparsevec.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import SparseVector +from .. import SparseVector class SparseVectorField(Field): diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py index 22a87e5..83f9997 100644 --- a/pgvector/peewee/vector.py +++ b/pgvector/peewee/vector.py @@ -1,5 +1,5 @@ from peewee import Expression, Field -from ..utils import Vector +from .. 
import Vector class VectorField(Field): diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index 9007c37..980af84 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector, register_vector_async -from ..utils import Bit, HalfVector, SparseVector, Vector + +# TODO remove +from .. import Bit, HalfVector, SparseVector, Vector __all__ = [ 'register_vector', diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py index f8eeb61..cffe8fb 100644 --- a/pgvector/psycopg/bit.py +++ b/pgvector/psycopg/bit.py @@ -1,6 +1,6 @@ from psycopg.adapt import Dumper from psycopg.pq import Format -from ..utils import Bit +from .. import Bit class BitDumper(Dumper): diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py index 351d2cb..b3a0060 100644 --- a/pgvector/psycopg/halfvec.py +++ b/pgvector/psycopg/halfvec.py @@ -1,6 +1,6 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import HalfVector +from .. import HalfVector class HalfVectorDumper(Dumper): diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py index 435fd06..384a0e1 100644 --- a/pgvector/psycopg/sparsevec.py +++ b/pgvector/psycopg/sparsevec.py @@ -1,6 +1,6 @@ from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import SparseVector +from .. import SparseVector class SparseVectorDumper(Dumper): diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py index 0f62ca9..db9e826 100644 --- a/pgvector/psycopg/vector.py +++ b/pgvector/psycopg/vector.py @@ -1,7 +1,7 @@ import psycopg from psycopg.adapt import Loader, Dumper from psycopg.pq import Format -from ..utils import Vector +from .. 
import Vector class VectorDumper(Dumper): diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index f109203..b40c673 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,5 +1,7 @@ from .register import register_vector -from ..utils import HalfVector, SparseVector, Vector + +# TODO remove +from .. import HalfVector, SparseVector, Vector __all__ = [ 'register_vector', diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py index b50e89b..0a4c736 100644 --- a/pgvector/psycopg2/halfvec.py +++ b/pgvector/psycopg2/halfvec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import HalfVector +from .. import HalfVector class HalfvecAdapter: diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py index a542807..148eff2 100644 --- a/pgvector/psycopg2/sparsevec.py +++ b/pgvector/psycopg2/sparsevec.py @@ -1,5 +1,5 @@ from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import SparseVector +from .. import SparseVector class SparsevecAdapter: diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py index 5bd00bb..562de18 100644 --- a/pgvector/psycopg2/vector.py +++ b/pgvector/psycopg2/vector.py @@ -1,6 +1,6 @@ import numpy as np from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type -from ..utils import Vector +from .. import Vector class VectorAdapter: diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 4955eeb..52adf88 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -4,7 +4,9 @@ from .sparsevec import SPARSEVEC from .vector import VECTOR from .vector import VECTOR as Vector -from ..utils import HalfVector, SparseVector + +# TODO remove +from .. 
import HalfVector, SparseVector __all__ = [ 'Vector', diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py index 639f77b..10688b5 100644 --- a/pgvector/sqlalchemy/halfvec.py +++ b/pgvector/sqlalchemy/halfvec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import HalfVector +from .. import HalfVector class HALFVEC(UserDefinedType): diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py index 370f5d1..0058679 100644 --- a/pgvector/sqlalchemy/sparsevec.py +++ b/pgvector/sqlalchemy/sparsevec.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import SparseVector +from .. import SparseVector class SPARSEVEC(UserDefinedType): diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py index f57a045..5a1e11f 100644 --- a/pgvector/sqlalchemy/vector.py +++ b/pgvector/sqlalchemy/vector.py @@ -1,6 +1,6 @@ from sqlalchemy.dialects.postgresql.base import ischema_names from sqlalchemy.types import UserDefinedType, Float, String -from ..utils import Vector +from .. import Vector class VECTOR(UserDefinedType): From 08e29e1acdcdf03965f7ffb4e1e552688df51785 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:04:40 -0800 Subject: [PATCH 367/424] Added todo [skip ci] --- pgvector/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 1dcc240..8cdb5d6 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,3 +1,4 @@ +# TODO remove from .. 
import Bit, HalfVector, SparseVector, Vector __all__ = [ From 1c0ff62b65718899915cd51466c63b9b60c3787f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:50:36 -0800 Subject: [PATCH 368/424] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1788ff4..df60740 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.4.0 (unreleased) +- Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Dropped support for Python < 3.9 ## 0.3.6 (2024-10-26) From f618edb43395795451d3079b1def7f6c8cbb76ba Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 21:55:07 -0800 Subject: [PATCH 369/424] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index df60740..24aebf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes +- Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 ## 0.3.6 (2024-10-26) From 537f3ba72519314b2bc5b65f7d625594a496345e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 22:01:16 -0800 Subject: [PATCH 370/424] Updated changelog [skip ci] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24aebf9..42e9bff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.4.0 (unreleased) +- Added top-level `pgvector` package - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 From 571bf4287cfe8a2371f477250c02efa3f62f67a1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Feb 2025 22:03:52 -0800 Subject: [PATCH 371/424] Updated changelog 
[skip ci] --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42e9bff..f53a2ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## 0.4.0 (unreleased) - Added top-level `pgvector` package +- Changed `globally` option to default to `False` for Psycopg 2 +- Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes - Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases - Dropped support for Python < 3.9 From 1676e3ead391493375ff6958a5b80c78080cf01e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 17:13:55 -0800 Subject: [PATCH 372/424] Test SQLAlchemy 1 on CI --- .github/workflows/build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc53dfe..4d4e8ed 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,3 +24,6 @@ jobs: make sudo make install - run: pytest + + - run: pip install "SQLAlchemy<2" -U + - run: pytest tests/test_sqlalchemy.py From ac9fd532f77c1497df250e519238f7f5d627f645 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 17:49:10 -0800 Subject: [PATCH 373/424] Improved Bit constructor for uint8 NumPy arrays --- pgvector/bit.py | 11 ++++------- tests/test_bit.py | 4 +--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 227edc1..36da723 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -7,14 +7,11 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: - # TODO change in 0.4.0 # TODO raise if dtype not bool or uint8 - # if isinstance(value, np.ndarray) and value.dtype == np.uint8: - # value = np.unpackbits(value) - # else: - # value = np.asarray(value, dtype=bool) - - value = np.asarray(value, dtype=bool) + if isinstance(value, 
np.ndarray) and value.dtype == np.uint8: + value = np.unpackbits(value) + else: + value = np.asarray(value, dtype=bool) if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index e0dcfe6..1d771ca 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -15,9 +15,7 @@ def test_str(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - # TODO change in 0.4.0 - # assert Bit(arr).to_text() == '111111100000011100000000' - assert Bit(arr).to_text() == '110' + assert Bit(arr).to_text() == '111111100000011100000000' def test_ndarray_same_object(self): arr = np.array([True, False, True]) From 1b25460e6184bb744b9c71c9c5b95852bdf7c63f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:02:23 -0800 Subject: [PATCH 374/424] Raise error for unexpected dtype for Bit constructor [skip ci] --- pgvector/bit.py | 8 +++++--- tests/test_bit.py | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 36da723..4be7385 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -7,9 +7,11 @@ def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value else: - # TODO raise if dtype not bool or uint8 - if isinstance(value, np.ndarray) and value.dtype == np.uint8: - value = np.unpackbits(value) + if isinstance(value, np.ndarray): + if value.dtype == np.uint8: + value = np.unpackbits(value).astype(bool) + elif value.dtype != np.bool: + raise ValueError('expected dtype to be bool or uint8') else: value = np.asarray(value, dtype=bool) diff --git a/tests/test_bit.py b/tests/test_bit.py index 1d771ca..5e1bff2 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -17,6 +17,12 @@ def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) assert Bit(arr).to_text() == '111111100000011100000000' + def test_ndarray_uint16(self): + arr = np.array([254, 7, 0], dtype=np.uint16) + with 
pytest.raises(ValueError) as error: + Bit(arr) + assert str(error.value) == 'expected dtype to be bool or uint8' + def test_ndarray_same_object(self): arr = np.array([True, False, True]) assert Bit(arr).to_list() == [True, False, True] From 8b927161f7856415436159d0b2c804280261a759 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:11:57 -0800 Subject: [PATCH 375/424] Improved asyncpg tests [skip ci] --- tests/test_asyncpg.py | 45 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 3c36048..982ea8d 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,6 +1,6 @@ import asyncpg import numpy as np -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector, Vector from pgvector.asyncpg import register_vector import pytest @@ -15,13 +15,15 @@ async def test_vector(self): await register_vector(conn) - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 3]) + embedding2 = np.array([4.5, 5, 6]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embedding'], embedding) + assert np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None # ensures binary format is correct text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") @@ -38,12 +40,14 @@ async def test_halfvec(self): await register_vector(conn) - embedding = [1.5, 2, 3] - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = 
HalfVector([1.5, 2, 3]) + embedding2 = [4.5, 5, 6] + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['embedding'].to_list() == [1.5, 2, 3] - assert res[1]['embedding'] is None + assert res[0]['embedding'] == embedding + assert res[1]['embedding'] == HalfVector(embedding2) + assert res[2]['embedding'] is None # ensures binary format is correct text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") @@ -87,7 +91,7 @@ async def test_sparsevec(self): await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert res[0]['embedding'].to_list() == [1.5, 2, 3] + assert res[0]['embedding'] == embedding assert res[1]['embedding'] is None # ensures binary format is correct @@ -105,12 +109,15 @@ async def test_vector_array(self): await register_vector(conn) - embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] - await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings[0], embeddings[1]) + embeddings = [Vector([1.5, 2, 3]), Vector([4.5, 5, 6])] + embeddings2 = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[]), (ARRAY[$3, $4]::vector[])", embeddings[0], embeddings[1], embeddings2[0], embeddings2[1]) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embeddings'][0], embeddings[0]) - assert np.array_equal(res[0]['embeddings'][1], embeddings[1]) + assert np.array_equal(res[0]['embeddings'][0], embeddings[0].to_numpy()) + assert np.array_equal(res[0]['embeddings'][1], embeddings[1].to_numpy()) + assert np.array_equal(res[1]['embeddings'][0], embeddings2[0]) + assert np.array_equal(res[1]['embeddings'][1], 
embeddings2[1]) await conn.close() @@ -126,10 +133,12 @@ async def init(conn): await conn.execute('DROP TABLE IF EXISTS asyncpg_items') await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 3]) + embedding2 = np.array([1.5, 2, 3]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") - assert np.array_equal(res[0]['embedding'], embedding) + assert np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None From 9f825f2e8360a4f6ec8af0341584817e5191008c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:14:09 -0800 Subject: [PATCH 376/424] Improved asyncpg test [skip ci] --- tests/test_asyncpg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 982ea8d..34d66a1 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -110,8 +110,10 @@ async def test_vector_array(self): await register_vector(conn) embeddings = [Vector([1.5, 2, 3]), Vector([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES ($1)", embeddings) + embeddings2 = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] - await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[]), (ARRAY[$3, $4]::vector[])", embeddings[0], embeddings[1], embeddings2[0], embeddings2[1]) + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings2[0], embeddings2[1]) res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") assert 
np.array_equal(res[0]['embeddings'][0], embeddings[0].to_numpy()) From bb3b32ccf9718c3675767de3e226d3638c1f82ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:20:20 -0800 Subject: [PATCH 377/424] Improved tests [skip ci] --- tests/test_peewee.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 670d880..d7028c3 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,7 @@ from math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField db = PostgresqlDatabase('pgvector_python_test') @@ -77,7 +77,7 @@ def test_vector_l1_distance(self): def test_halfvec(self): Item.create(id=1, half_embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -129,7 +129,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item.create(id=1, sparse_embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -186,7 +186,7 @@ def test_halfvec_avg(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() - assert avg.to_list() == [2.5, 3.5, 4.5] + assert avg == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() @@ -194,7 +194,7 @@ def test_halfvec_sum(self): Item.create(half_embedding=[1, 2, 3]) Item.create(half_embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() - assert 
sum.to_list() == [5, 7, 9] + assert sum == HalfVector([5, 7, 9]) def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) From c7cd058ea3145fd7cdcb45f712c0f4450ddbe16e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:24:06 -0800 Subject: [PATCH 378/424] Improved tests [skip ci] --- tests/test_django.py | 12 ++++++------ tests/test_psycopg.py | 8 ++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index 65082a3..f187ad4 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -199,7 +199,7 @@ def test_vector_l1_distance(self): def test_halfvec(self): Item(id=1, half_embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -251,7 +251,7 @@ def test_bit_jaccard_distance(self): def test_sparsevec(self): Item(id=1, sparse_embedding=SparseVector([1, 2, 3])).save() item = Item.objects.get(pk=1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -309,7 +309,7 @@ def test_halfvec_avg(self): Item(half_embedding=[1, 2, 3]).save() Item(half_embedding=[4, 5, 6]).save() avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] - assert avg.to_list() == [2.5, 3.5, 4.5] + assert avg == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] @@ -317,7 +317,7 @@ def test_halfvec_sum(self): Item(half_embedding=[1, 2, 3]).save() Item(half_embedding=[4, 5, 6]).save() sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] - assert sum.to_list() == [5, 7, 9] + assert sum == HalfVector([5, 7, 9]) def test_serialization(self): create_items() @@ -375,7 +375,7 @@ def 
test_halfvec_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).half_embedding.to_list() + assert Item.objects.get(pk=1).half_embedding == HalfVector([4, 5, 6]) def test_halfvec_form_save_missing(self): Item(id=1).save() @@ -432,7 +432,7 @@ def test_sparsevec_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).sparse_embedding.to_list() + assert Item.objects.get(pk=1).sparse_embedding == SparseVector([4, 5, 6]) def test_sparesevec_form_save_missing(self): Item(id=1).save() diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 6a9d0b7..e2a40b2 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -69,17 +69,19 @@ def test_halfvec(self): conn.execute('INSERT INTO psycopg_items (half_embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT half_embedding FROM psycopg_items ORDER BY id').fetchone()[0] - assert res.to_list() == [1.5, 2, 3] + assert res == HalfVector([1.5, 2, 3]) def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -106,11 +108,12 @@ def test_sparsevec(self): conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) res = conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] - assert res.to_list() == [1.5, 2, 3] + assert res == SparseVector([1.5, 2, 3]) def test_sparsevec_binary_format(self): 
embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res == embedding assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -120,6 +123,7 @@ def test_sparsevec_binary_format(self): def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res == embedding assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] From 8441b463ccc8738a951dd8fd2c9ac8b8b292c774 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:26:05 -0800 Subject: [PATCH 379/424] Improved tests [skip ci] --- tests/test_psycopg2.py | 12 ++++++------ tests/test_sparse_vector.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 1994c87..71e0015 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -46,7 +46,7 @@ def test_halfvec(self): cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][0] == HalfVector([1.5, 2, 3]) assert res[1][0] is None def test_bit(self): @@ -64,7 +64,7 @@ def test_sparsevec(self): cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert res[0][0].to_list() == [1.5, 2, 3] + assert res[0][0] == SparseVector([1.5, 2, 3]) assert res[1][0] is None def test_vector_array(self): @@ -82,8 +82,8 @@ def test_halfvec_array(self): cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0].to_list() == [1.5, 2, 3] - assert res[0][1].to_list() == [4.5, 5, 6] + assert res[0][0] == HalfVector([1.5, 2, 3]) + assert res[0][1] == HalfVector([4.5, 5, 6]) def test_sparsevec_array(self): embeddings = [SparseVector([1.5, 2, 3]), 
SparseVector([4.5, 5, 6])] @@ -91,8 +91,8 @@ def test_sparsevec_array(self): cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0].to_list() == [1.5, 2, 3] - assert res[0][1].to_list() == [4.5, 5, 6] + assert res[0][0] == SparseVector([1.5, 2, 3]) + assert res[0][1] == SparseVector([4.5, 5, 6]) def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index b5e7fe8..29c3ea7 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -56,6 +56,7 @@ def test_equality(self): assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector([1, 0, 2, 0, 3, 0]) assert SparseVector([1, 0, 2, 0, 3, 0]) != SparseVector([1, 0, 2, 0, 3, 1]) assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + assert SparseVector({}, 1) != SparseVector({}, 2) def test_dimensions(self): assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 From 6b8857a3146cf581bebcf32eb81a37135aa2fc15 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:28:20 -0800 Subject: [PATCH 380/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 052edd7..d791bd6 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import asyncpg import numpy as np import os -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -256,7 +256,7 @@ def test_halfvec(self, engine): session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert 
item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self, engine): create_items() @@ -348,7 +348,7 @@ def test_sparsevec(self, engine): session.add(Item(id=1, sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self, engine): create_items() @@ -551,8 +551,8 @@ def test_halfvec_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.half_embeddings[0].to_list() == [1, 2, 3] - assert item.half_embeddings[1].to_list() == [4, 5, 6] + assert item.half_embeddings[0] == HalfVector([1, 2, 3]) + assert item.half_embeddings[1] == HalfVector([4, 5, 6]) @pytest.mark.parametrize('engine', async_engines) @@ -582,7 +582,7 @@ async def test_halfvec(self, engine): embedding = [1, 2, 3] session.add(Item(id=1, half_embedding=embedding)) item = await session.get(Item, 1) - assert item.half_embedding.to_list() == embedding + assert item.half_embedding == HalfVector(embedding) await engine.dispose() @@ -608,7 +608,7 @@ async def test_sparsevec(self, engine): embedding = [1, 2, 3] session.add(Item(id=1, sparse_embedding=embedding)) item = await session.get(Item, 1) - assert item.sparse_embedding.to_list() == embedding + assert item.sparse_embedding == SparseVector(embedding) await engine.dispose() From 022dd061b4ebdfb7e39b23abe782bc2d89ec9e98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:29:57 -0800 Subject: [PATCH 381/424] Improved tests [skip ci] --- tests/test_sqlalchemy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index d791bd6..4b1e516 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,7 +1,7 @@ import asyncpg import numpy as np import os -from pgvector import 
HalfVector, SparseVector +from pgvector import HalfVector, SparseVector, Vector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY @@ -637,9 +637,14 @@ async def test_vector_array(self, engine): async with async_session() as session: async with session.begin(): - session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.add(Item(id=1, embeddings=[Vector([1, 2, 3]), Vector([4, 5, 6])])) item = await session.get(Item, 1) assert item.embeddings[0].tolist() == [1, 2, 3] assert item.embeddings[1].tolist() == [4, 5, 6] + session.add(Item(id=2, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + item = await session.get(Item, 2) + assert item.embeddings[0].tolist() == [1, 2, 3] + assert item.embeddings[1].tolist() == [4, 5, 6] + await engine.dispose() From bb02ee2742714cb4b566b95deb71a82539977dd4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:48:17 -0800 Subject: [PATCH 382/424] Improved tests [skip ci] --- tests/test_sqlmodel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index b0e8ccd..8a472b1 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -1,5 +1,5 @@ import numpy as np -from pgvector import SparseVector +from pgvector import HalfVector, SparseVector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest from sqlalchemy.exc import StatementError @@ -107,7 +107,7 @@ def test_halfvec(self): session.add(Item(id=1, half_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.half_embedding.to_list() == [1, 2, 3] + assert item.half_embedding == HalfVector([1, 2, 3]) def test_halfvec_l2_distance(self): create_items() @@ -157,7 +157,7 @@ def test_sparsevec(self): session.add(Item(id=1, 
sparse_embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.sparse_embedding.to_list() == [1, 2, 3] + assert item.sparse_embedding == SparseVector([1, 2, 3]) def test_sparsevec_l2_distance(self): create_items() @@ -220,7 +220,7 @@ def test_halfvec_avg(self): session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) res = session.exec(select(avg(Item.half_embedding))).first() - assert res.to_list() == [2.5, 3.5, 4.5] + assert res == HalfVector([2.5, 3.5, 4.5]) def test_halfvec_sum(self): with Session(engine) as session: @@ -229,7 +229,7 @@ def test_halfvec_sum(self): session.add(Item(half_embedding=[1, 2, 3])) session.add(Item(half_embedding=[4, 5, 6])) res = session.exec(select(sum(Item.half_embedding))).first() - assert res.to_list() == [5, 7, 9] + assert res == HalfVector([5, 7, 9]) def test_bad_dimensions(self): item = Item(embedding=[1, 2]) From 340caa58195fc5e7b99eed8ab7fe4e4e912fd73c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:53:17 -0800 Subject: [PATCH 383/424] Improved tests [skip ci] --- tests/test_psycopg.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index e2a40b2..0859be7 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -75,6 +75,7 @@ def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) + # TODO move assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -82,6 +83,7 @@ def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) + # TODO move assert res.to_list() == [1.5, 2, 3] assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) @@ -114,6 +116,7 @@ def 
test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res == embedding + # TODO move assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -124,6 +127,7 @@ def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res == embedding + # TODO move assert res.dimensions() == 6 assert res.indices() == [0, 2, 4] assert res.values() == [1.5, 2, 3] @@ -166,8 +170,8 @@ def test_binary_copy_to(self): cur = conn.cursor() with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: for row in copy.rows(): - assert Vector.from_binary(row[0]).to_list() == [1.5, 2, 3] - assert HalfVector.from_binary(row[1]).to_list() == [1.5, 2, 3] + assert np.array_equal(Vector.from_binary(row[0]).to_numpy(), embedding) + assert HalfVector.from_binary(row[1]) == half_embedding def test_binary_copy_to_set_types(self): embedding = np.array([1.5, 2, 3]) @@ -178,7 +182,7 @@ def test_binary_copy_to_set_types(self): copy.set_types(['vector', 'halfvec']) for row in copy.rows(): assert np.array_equal(row[0], embedding) - assert row[1].to_list() == [1.5, 2, 3] + assert row[1] == half_embedding def test_vector_array(self): embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] From e6edb2a68f1a93df94c725c6d5ba29654694feab Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 18:56:52 -0800 Subject: [PATCH 384/424] Improved tests [skip ci] --- tests/test_half_vector.py | 5 +++++ tests/test_psycopg.py | 9 --------- tests/test_sparse_vector.py | 8 ++++++++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 6a94c2e..a17699a 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -44,3 +44,8 @@ 
def test_equality(self): def test_dimensions(self): assert HalfVector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = HalfVector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 0859be7..24ab321 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -83,9 +83,6 @@ def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) - # TODO move - assert res.to_list() == [1.5, 2, 3] - assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_bit(self): embedding = Bit([True, False, True]) @@ -127,12 +124,6 @@ def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] assert res == embedding - # TODO move - assert res.dimensions() == 6 - assert res.indices() == [0, 2, 4] - assert res.values() == [1.5, 2, 3] - assert res.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_text_copy_from(self): embedding = np.array([1.5, 2, 3]) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 29c3ea7..fb01b5e 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -73,3 +73,11 @@ def test_to_coo(self): def test_zero_vector_text(self): vec = SparseVector({}, 3) assert vec.to_list() == SparseVector.from_text(vec.to_text()).to_list() + + def test_from_text(self): + vec = SparseVector.from_text('{1:1.5,3:2,5:3}/6') + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) From b57a2e9ed35c68eaa22afe27ce93401b0190adc2 Mon Sep 17 
00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:06:11 -0800 Subject: [PATCH 385/424] Improved tests [skip ci] --- tests/test_half_vector.py | 8 ++++++++ tests/test_psycopg.py | 9 --------- tests/test_sparse_vector.py | 11 +++++++++++ tests/test_vector.py | 13 +++++++++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index a17699a..9c0b041 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -1,6 +1,7 @@ import numpy as np from pgvector import HalfVector import pytest +from struct import pack class TestHalfVector: @@ -49,3 +50,10 @@ def test_from_text(self): vec = HalfVector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + + def test_from_binary(self): + data = pack('>HH3e', 3, 0, *[1.5, 2, 3]) + vec = HalfVector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert vec.to_binary() == data diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 24ab321..f61b4e3 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -75,9 +75,6 @@ def test_halfvec_binary_format(self): embedding = HalfVector([1.5, 2, 3]) res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] assert res == HalfVector([1.5, 2, 3]) - # TODO move - assert res.to_list() == [1.5, 2, 3] - assert np.array_equal(res.to_numpy(), np.array([1.5, 2, 3])) def test_halfvec_text_format(self): embedding = HalfVector([1.5, 2, 3]) @@ -113,12 +110,6 @@ def test_sparsevec_binary_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] assert res == embedding - # TODO move - assert res.dimensions() == 6 - assert res.indices() == [0, 2, 4] - assert res.values() == [1.5, 2, 3] - assert res.to_list() == [1.5, 0, 2, 0, 3, 0] - assert 
np.array_equal(res.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_sparsevec_text_format(self): embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index fb01b5e..fb51db9 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -2,6 +2,7 @@ from pgvector import SparseVector import pytest from scipy.sparse import coo_array +from struct import pack class TestSparseVector: @@ -81,3 +82,13 @@ def test_from_text(self): assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + + def test_from_binary(self): + data = pack('>iii3i3f', 6, 3, 0, *[0, 2, 4], *[1.5, 2, 3]) + vec = SparseVector.from_binary(data) + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert vec.to_binary() == data diff --git a/tests/test_vector.py b/tests/test_vector.py index 406637f..094dd34 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -1,6 +1,7 @@ import numpy as np from pgvector import Vector import pytest +from struct import pack class TestVector: @@ -44,3 +45,15 @@ def test_equality(self): def test_dimensions(self): assert Vector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = Vector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + + def test_from_binary(self): + data = pack('>HH3f', 3, 0, *[1.5, 2, 3]) + vec = Vector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert vec.to_binary() == data From b6ccb3043fb1dac552b4dcdf6ecb947434d3b234 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:06:51 -0800 Subject: [PATCH 386/424] Improved 
tests [skip ci] --- tests/test_half_vector.py | 2 +- tests/test_sparse_vector.py | 2 +- tests/test_vector.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 9c0b041..756adc2 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -52,7 +52,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) def test_from_binary(self): - data = pack('>HH3e', 3, 0, *[1.5, 2, 3]) + data = pack('>HH3e', 3, 0, 1.5, 2, 3) vec = HalfVector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index fb51db9..cf5b016 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -84,7 +84,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) def test_from_binary(self): - data = pack('>iii3i3f', 6, 3, 0, *[0, 2, 4], *[1.5, 2, 3]) + data = pack('>iii3i3f', 6, 3, 0, 0, 2, 4, 1.5, 2, 3) vec = SparseVector.from_binary(data) assert vec.dimensions() == 6 assert vec.indices() == [0, 2, 4] diff --git a/tests/test_vector.py b/tests/test_vector.py index 094dd34..c367a7a 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -52,7 +52,7 @@ def test_from_text(self): assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) def test_from_binary(self): - data = pack('>HH3f', 3, 0, *[1.5, 2, 3]) + data = pack('>HH3f', 3, 0, 1.5, 2, 3) vec = Vector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) From e566d4c9b4968b232c2348e9e608d06ee90b6253 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:20:13 -0800 Subject: [PATCH 387/424] Improved tests [skip ci] --- tests/test_django.py | 12 ++++++------ tests/test_half_vector.py | 4 ++-- tests/test_peewee.py | 10 +++++----- tests/test_psycopg.py 
| 12 ++++++------ tests/test_psycopg2.py | 2 +- tests/test_sparse_vector.py | 8 ++++---- tests/test_sqlalchemy.py | 18 +++++++++--------- tests/test_sqlmodel.py | 2 +- tests/test_vector.py | 4 ++-- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/test_django.py b/tests/test_django.py index f187ad4..7a8a6eb 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -165,7 +165,7 @@ def setup_method(self): def test_vector(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 def test_vector_l2_distance(self): @@ -293,7 +293,7 @@ def test_vector_avg(self): Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + assert np.array_equal(avg, [2.5, 3.5, 4.5]) def test_vector_sum(self): sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] @@ -301,7 +301,7 @@ def test_vector_sum(self): Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] - assert np.array_equal(sum, np.array([5, 7, 9])) + assert np.array_equal(sum, [5, 7, 9]) def test_halfvec_avg(self): avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] @@ -347,7 +347,7 @@ def test_vector_form_save(self): assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).embedding.tolist() + assert np.array_equal(Item.objects.get(pk=1).embedding, [4, 5, 6]) def test_vector_form_save_missing(self): Item(id=1).save() @@ -465,8 +465,8 @@ def test_vector_array(self): # this fails if the driver does not cast arrays item = Item.objects.get(pk=1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + 
assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) def test_double_array(self): Item(id=1, double_embedding=[1, 1, 1]).save() diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py index 756adc2..78b4977 100644 --- a/tests/test_half_vector.py +++ b/tests/test_half_vector.py @@ -49,11 +49,11 @@ def test_dimensions(self): def test_from_text(self): vec = HalfVector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) def test_from_binary(self): data = pack('>HH3e', 3, 0, 1.5, 2, 3) vec = HalfVector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) assert vec.to_binary() == data diff --git a/tests/test_peewee.py b/tests/test_peewee.py index d7028c3..64fc009 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -43,7 +43,7 @@ def setup_method(self): def test_vector(self): Item.create(id=1, embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 def test_vector_l2_distance(self): @@ -170,7 +170,7 @@ def test_vector_avg(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + assert np.array_equal(avg, [2.5, 3.5, 4.5]) def test_vector_sum(self): sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() @@ -178,7 +178,7 @@ def test_vector_sum(self): Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() - assert np.array_equal(sum, np.array([5, 7, 9])) + assert np.array_equal(sum, [5, 7, 9]) def 
test_halfvec_avg(self): avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() @@ -220,5 +220,5 @@ class Meta: # fails with column "embeddings" is of type vector[] but expression is of type text[] # ExtItem.create(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]) # item = ExtItem.get_by_id(1) - # assert np.array_equal(item.embeddings[0], np.array([1, 2, 3])) - # assert np.array_equal(item.embeddings[1], np.array([4, 5, 6])) + # assert np.array_equal(item.embeddings[0], [1, 2, 3]) + # assert np.array_equal(item.embeddings[1], [4, 5, 6]) diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index f61b4e3..698b34f 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -46,23 +46,23 @@ def test_vector_text_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) def test_vector_binary_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) def test_vector_class_binary_format(self): embedding = Vector([1.5, 2, 3]) res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] - assert np.array_equal(res, np.array([1.5, 2, 3])) + assert np.array_equal(res, [1.5, 2, 3]) def test_vector_class_text_format(self): embedding = Vector([1.5, 2, 3]) res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([1.5, 2, 3])) + assert np.array_equal(res, [1.5, 2, 3]) def test_halfvec(self): embedding = HalfVector([1.5, 2, 3]) @@ -182,7 +182,7 @@ def configure(conn): with pool.connection() as conn: res = conn.execute("SELECT '[1,2,3]'::vector").fetchone() - 
assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) pool.close() @@ -218,6 +218,6 @@ async def configure(conn): async with conn.cursor() as cur: await cur.execute("SELECT '[1,2,3]'::vector") res = await cur.fetchone() - assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) await pool.close() diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 71e0015..8f56ef5 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -122,7 +122,7 @@ def test_pool(self): cur = conn.cursor() cur.execute("SELECT '[1,2,3]'::vector") res = cur.fetchone() - assert np.array_equal(res[0], np.array([1, 2, 3])) + assert np.array_equal(res[0], [1, 2, 3]) finally: pool.putconn(conn) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index cf5b016..dff03dd 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -9,7 +9,7 @@ class TestSparseVector: def test_list(self): vec = SparseVector([1, 0, 2, 0, 3, 0]) assert vec.to_list() == [1, 0, 2, 0, 3, 0] - assert vec.to_numpy().tolist() == [1, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), [1, 0, 2, 0, 3, 0]) assert vec.indices() == [0, 2, 4] def test_list_dimensions(self): @@ -69,7 +69,7 @@ def test_values(self): assert SparseVector([1, 0, 2, 0, 3, 0]).values() == [1, 2, 3] def test_to_coo(self): - assert SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray().tolist() == [[1, 0, 2, 0, 3, 0]] + assert np.array_equal(SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray(), [[1, 0, 2, 0, 3, 0]]) def test_zero_vector_text(self): vec = SparseVector({}, 3) @@ -81,7 +81,7 @@ def test_from_text(self): assert vec.indices() == [0, 2, 4] assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert np.array_equal(vec.to_numpy(), [1.5, 0, 2, 0, 3, 0]) def test_from_binary(self): data = pack('>iii3i3f', 6, 3, 0, 0, 2, 
4, 1.5, 2, 3) @@ -90,5 +90,5 @@ def test_from_binary(self): assert vec.indices() == [0, 2, 4] assert vec.values() == [1.5, 2, 3] assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 0, 2, 0, 3, 0])) + assert np.array_equal(vec.to_numpy(), [1.5, 0, 2, 0, 3, 0]) assert vec.to_binary() == data diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 4b1e516..41c309f 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -201,7 +201,7 @@ def test_vector(self, engine): session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, [1, 2, 3]) def test_vector_l2_distance(self, engine): create_items() @@ -509,7 +509,7 @@ def test_automap(self, engine): with Session(engine) as session: session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) item = session.query(AutoItem).first() - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, [1, 2, 3]) def test_half_precision(self, engine): create_items() @@ -541,8 +541,8 @@ def test_vector_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) def test_halfvec_array(self, engine): with Session(engine) as session: @@ -621,7 +621,7 @@ async def test_avg(self, engine): session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) res = await session.scalars(select(avg(Item.embedding))) - assert res.first().tolist() == [2.5, 3.5, 4.5] + assert np.array_equal(res.first(), [2.5, 3.5, 4.5]) await engine.dispose() @@ -639,12 +639,12 @@ async def test_vector_array(self, engine): async with session.begin(): session.add(Item(id=1, 
embeddings=[Vector([1, 2, 3]), Vector([4, 5, 6])])) item = await session.get(Item, 1) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) session.add(Item(id=2, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) item = await session.get(Item, 2) - assert item.embeddings[0].tolist() == [1, 2, 3] - assert item.embeddings[1].tolist() == [4, 5, 6] + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) await engine.dispose() diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py index 8a472b1..f4994f4 100644 --- a/tests/test_sqlmodel.py +++ b/tests/test_sqlmodel.py @@ -76,7 +76,7 @@ def test_vector(self): session.add(Item(id=1, embedding=[1, 2, 3])) session.commit() item = session.get(Item, 1) - assert item.embedding.tolist() == [1, 2, 3] + assert np.array_equal(item.embedding, np.array([1, 2, 3])) def test_vector_l2_distance(self): create_items() diff --git a/tests/test_vector.py b/tests/test_vector.py index c367a7a..e5a16fe 100644 --- a/tests/test_vector.py +++ b/tests/test_vector.py @@ -49,11 +49,11 @@ def test_dimensions(self): def test_from_text(self): vec = Vector.from_text('[1.5,2,3]') assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) def test_from_binary(self): data = pack('>HH3f', 3, 0, 1.5, 2, 3) vec = Vector.from_binary(data) assert vec.to_list() == [1.5, 2, 3] - assert np.array_equal(vec.to_numpy(), np.array([1.5, 2, 3])) + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) assert vec.to_binary() == data From 057eff226bdb992ebdd952628bf3d54996d9437d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 19:38:31 -0800 Subject: [PATCH 388/424] Improved tests [skip ci] --- tests/test_psycopg2.py | 6 ++---- 
tests/test_sqlalchemy.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 8f56ef5..3e5c8c3 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -82,8 +82,7 @@ def test_halfvec_array(self): cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0] == HalfVector([1.5, 2, 3]) - assert res[0][1] == HalfVector([4.5, 5, 6]) + assert res[0] == [HalfVector([1.5, 2, 3]), HalfVector([4.5, 5, 6])] def test_sparsevec_array(self): embeddings = [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] @@ -91,8 +90,7 @@ def test_sparsevec_array(self): cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') res = cur.fetchone() - assert res[0][0] == SparseVector([1.5, 2, 3]) - assert res[0][1] == SparseVector([4.5, 5, 6]) + assert res[0] == [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] def test_cursor_factory(self): for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 41c309f..0d8d1ca 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -551,8 +551,7 @@ def test_halfvec_array(self, engine): # this fails if the driver does not cast arrays item = session.get(Item, 1) - assert item.half_embeddings[0] == HalfVector([1, 2, 3]) - assert item.half_embeddings[1] == HalfVector([4, 5, 6]) + assert item.half_embeddings == [HalfVector([1, 2, 3]), HalfVector([4, 5, 6])] @pytest.mark.parametrize('engine', async_engines) From 8443ff519ac39a9f0b9b2c7233b33accbe6f63ae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 10 Feb 2025 20:05:01 -0800 Subject: [PATCH 389/424] Added missing dependency for example [skip ci] --- examples/implicit/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/implicit/requirements.txt b/examples/implicit/requirements.txt index 8f04b58..424abbd 100644 --- 
a/examples/implicit/requirements.txt +++ b/examples/implicit/requirements.txt @@ -1,3 +1,4 @@ +h5py implicit pgvector psycopg[binary] From 2496340bc5e91a0b5cad2462f276c7b488f2e36a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 16:45:35 -0800 Subject: [PATCH 390/424] Added support for pg8000 --- CHANGELOG.md | 1 + README.md | 48 ++++++++++++++++++++++++++++- pgvector/pg8000/__init__.py | 5 ++++ pgvector/pg8000/register.py | 23 ++++++++++++++ tests/test_pg8000.py | 60 +++++++++++++++++++++++++++++++++++++ 5 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 pgvector/pg8000/__init__.py create mode 100644 pgvector/pg8000/register.py create mode 100644 tests/test_pg8000.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f53a2ce..ebc165a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Added top-level `pgvector` package +- Added support for pg8000 - Changed `globally` option to default to `False` for Psycopg 2 - Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes diff --git a/README.md b/README.md index 5a59c9d..7f980bd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [pgvector](https://github.com/pgvector/pgvector) support for Python -Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), and [Peewee](https://github.com/coleifer/peewee) +Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), 
[pg8000](https://github.com/tlocke/pg8000), and [Peewee](https://github.com/coleifer/peewee) [![Build Status](https://github.com/pgvector/pgvector-python/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-python/actions) @@ -22,6 +22,7 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) +- [pg8000](#pg8000) [unreleased] - [Peewee](#peewee) Or check out some examples: @@ -562,6 +563,51 @@ await conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +## pg8000 + +Enable the extension + +```python +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +``` + +Register the vector type with your connection + +```python +from pgvector.pg8000 import register_vector + +register_vector(conn) +``` + +Create a table + +```python +conn.run('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + +Insert a vector + +```python +embedding = np.array([1, 2, 3]) +conn.run('INSERT INTO items (embedding) VALUES (:embedding)', embedding=embedding) +``` + +Get the nearest neighbors to a vector + +```python +conn.run('SELECT * FROM items ORDER BY embedding <-> :embedding LIMIT 5', embedding=embedding) +``` + +Add an approximate index + +```python +conn.run('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +conn.run('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Peewee Add a vector column diff --git a/pgvector/pg8000/__init__.py b/pgvector/pg8000/__init__.py new file mode 100644 index 0000000..b3b4440 --- /dev/null +++ b/pgvector/pg8000/__init__.py @@ -0,0 +1,5 @@ +from .register import register_vector + +__all__ = [ + 'register_vector' +] diff --git a/pgvector/pg8000/register.py b/pgvector/pg8000/register.py new file 
mode 100644 index 0000000..15ee219 --- /dev/null +++ b/pgvector/pg8000/register.py @@ -0,0 +1,23 @@ +import numpy as np +from .. import Vector, HalfVector, SparseVector + + +def register_vector(conn): + # use to_regtype to get first matching type in search path + res = conn.run("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") + type_info = dict(res) + + if 'vector' not in type_info: + raise RuntimeError('vector type not found in the database') + + conn.register_out_adapter(Vector, Vector._to_db) + conn.register_out_adapter(np.ndarray, Vector._to_db) + conn.register_in_adapter(type_info['vector'], Vector._from_db) + + if 'halfvec' in type_info: + conn.register_out_adapter(HalfVector, HalfVector._to_db) + conn.register_in_adapter(type_info['halfvec'], HalfVector._from_db) + + if 'sparsevec' in type_info: + conn.register_out_adapter(SparseVector, SparseVector._to_db) + conn.register_in_adapter(type_info['sparsevec'], SparseVector._from_db) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py new file mode 100644 index 0000000..86c0fb1 --- /dev/null +++ b/tests/test_pg8000.py @@ -0,0 +1,60 @@ +import numpy as np +import os +from pgvector import HalfVector, SparseVector, Vector +from pgvector.pg8000 import register_vector +from pg8000.native import Connection + +conn = Connection(os.environ["USER"], database='pgvector_python_test') + +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +conn.run('DROP TABLE IF EXISTS pg8000_items') +conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') + +register_vector(conn) + + +class TestPg8000: + def setup_method(self): + conn.run('DELETE FROM pg8000_items') + + def test_vector(self): + embedding = np.array([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) 
VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_halfvec(self): + embedding = HalfVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (half_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT half_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None + + def test_bit(self): + embedding = '101' + conn.run('INSERT INTO pg8000_items (binary_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT binary_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == '101' + assert res[1][0] is None + + def test_sparsevec(self): + embedding = SparseVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (sparse_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT sparse_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None From df1766b7f9ed6320958c04caf7f1b832d5320e4b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 16:59:42 -0800 Subject: [PATCH 391/424] Simplified test code [skip ci] --- tests/test_pg8000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py index 86c0fb1..4d3e474 100644 --- a/tests/test_pg8000.py +++ b/tests/test_pg8000.py @@ -8,7 +8,7 @@ conn.run('CREATE EXTENSION IF NOT EXISTS vector') conn.run('DROP TABLE IF EXISTS 
pg8000_items') -conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') +conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') register_vector(conn) From 70ff5d4765bb156a45d806d3cd171b3a38f03fca Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 14 Feb 2025 17:00:58 -0800 Subject: [PATCH 392/424] Improved tests [skip ci] --- tests/test_psycopg2.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index 3e5c8c3..7f4932d 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -49,6 +49,15 @@ def test_halfvec(self): assert res[0][0] == HalfVector([1.5, 2, 3]) assert res[1][0] is None + def test_halfvec_class(self): + embedding = HalfVector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == embedding + assert res[1][0] is None + def test_bit(self): embedding = '101' cur.execute('INSERT INTO psycopg2_items (binary_embedding) VALUES (%s), (NULL)', (embedding,)) From ac9e398f511ca65f11f62f4296e94f2106367936 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 13:15:19 -0800 Subject: [PATCH 393/424] Improved examples [skip ci] --- examples/cohere/example.py | 6 +++--- examples/openai/example.py | 25 ++++++++++++++--------- examples/sentence_transformers/example.py | 14 ++++++------- examples/sparse_search/example.py | 6 +++--- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/examples/cohere/example.py b/examples/cohere/example.py index 780352a..393d1e0 100644 --- a/examples/cohere/example.py +++ 
b/examples/cohere/example.py @@ -12,7 +12,7 @@ conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') -def fetch_embeddings(input, input_type): +def embed(input, input_type): co = cohere.Client() response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] @@ -23,12 +23,12 @@ def fetch_embeddings(input, input_type): 'The cat is purring', 'The bear is growling' ] -embeddings = fetch_embeddings(input, 'search_document') +embeddings = embed(input, 'search_document') for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, Bit(embedding))) query = 'forest' -query_embedding = fetch_embeddings([query], 'search_query')[0] +query_embedding = embed([query], 'search_query')[0] result = conn.execute('SELECT content FROM documents ORDER BY embedding <~> %s LIMIT 5', (Bit(query_embedding),)).fetchall() for row in result: print(row[0]) diff --git a/examples/openai/example.py b/examples/openai/example.py index ebed3d0..b9a078c 100644 --- a/examples/openai/example.py +++ b/examples/openai/example.py @@ -1,3 +1,4 @@ +import numpy as np from openai import OpenAI from pgvector.psycopg import register_vector import psycopg @@ -10,20 +11,24 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))') + +def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-small') + return [v.embedding for v in response.data] + + input = [ 'The dog is barking', 'The cat is purring', 'The bear is growling' ] - -client = OpenAI() -response = client.embeddings.create(input=input, model='text-embedding-3-small') -embeddings = [v.embedding for v in response.data] - +embeddings = 
embed(input) for content, embedding in zip(input, embeddings): - conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, np.array(embedding))) -document_id = 1 -neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() -for neighbor in neighbors: - print(neighbor[0]) +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (np.array(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/sentence_transformers/example.py b/examples/sentence_transformers/example.py index d4e7f96..3a7dca5 100644 --- a/examples/sentence_transformers/example.py +++ b/examples/sentence_transformers/example.py @@ -10,19 +10,19 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))') +model = SentenceTransformer('all-MiniLM-L6-v2') + input = [ 'The dog is barking', 'The cat is purring', 'The bear is growling' ] - -model = SentenceTransformer('all-MiniLM-L6-v2') embeddings = model.encode(input) - for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) -document_id = 1 -neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() -for neighbor in neighbors: - print(neighbor[0]) +query = 'forest' +query_embedding = model.encode(query) +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (query_embedding,)).fetchall() +for row in result: + print(row[0]) diff --git 
a/examples/sparse_search/example.py b/examples/sparse_search/example.py index fa6074e..2b5daea 100644 --- a/examples/sparse_search/example.py +++ b/examples/sparse_search/example.py @@ -20,7 +20,7 @@ special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()] -def fetch_embeddings(input): +def embed(input): feature = tokenizer( input, padding=True, @@ -42,12 +42,12 @@ def fetch_embeddings(input): 'The cat is purring', 'The bear is growling' ] -embeddings = fetch_embeddings(input) +embeddings = embed(input) for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector(embedding))) query = 'forest' -query_embedding = fetch_embeddings([query])[0] +query_embedding = embed([query])[0] result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector(query_embedding),)).fetchall() for row in result: print(row[0]) From 1443c3c3ca11b9efadb07612758c2ba62fb4ec65 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 13:50:50 -0800 Subject: [PATCH 394/424] Added halfvec example for OpenAI [skip ci] --- examples/openai/halfvec.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/openai/halfvec.py diff --git a/examples/openai/halfvec.py b/examples/openai/halfvec.py new file mode 100644 index 0000000..185c785 --- /dev/null +++ b/examples/openai/halfvec.py @@ -0,0 +1,34 @@ +from openai import OpenAI +from pgvector.psycopg import register_vector, HalfVector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding halfvec(3072))') +conn.execute('CREATE INDEX ON documents USING hnsw (embedding halfvec_cosine_ops)') + + 
+def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-large') + return [v.embedding for v in response.data] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, HalfVector(embedding))) + +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (HalfVector(query_embedding),)).fetchall() +for row in result: + print(row[0]) From 12146d74db24514831138b43ec69273e289cde1a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 16 Feb 2025 18:34:41 -0800 Subject: [PATCH 395/424] Improved example [skip ci] --- examples/sentence_transformers/example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sentence_transformers/example.py b/examples/sentence_transformers/example.py index 3a7dca5..50997d9 100644 --- a/examples/sentence_transformers/example.py +++ b/examples/sentence_transformers/example.py @@ -10,7 +10,7 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))') -model = SentenceTransformer('all-MiniLM-L6-v2') +model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') input = [ 'The dog is barking', From 78466224ec95a38441240753f090625056b87b1e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 19 Feb 2025 15:54:25 -0800 Subject: [PATCH 396/424] Added reference section to readme [skip ci] --- README.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/README.md b/README.md index 7f980bd..299753e 100644 --- a/README.md +++ b/README.md @@ -665,6 +665,99 @@ Item.add_index('embedding vector_l2_ops', using='hnsw') Use `vector_ip_ops` for inner product 
and `vector_cosine_ops` for cosine distance +## Reference + +### Half Vectors + +Create a half vector from a list + +```python +vec = HalfVector([1, 2, 3]) +``` + +Or a NumPy array + +```python +vec = HalfVector(np.array([1, 2, 3])) +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +### Sparse Vectors + +Create a sparse vector from a list + +```python +vec = SparseVector([1, 0, 2, 0, 3, 0]) +``` + +Or a NumPy array + +```python +vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) +``` + +Or a SciPy sparse array + +```python +arr = coo_array(([1, 2, 3], ([0, 2, 4],)), shape=(6,)) +vec = SparseVector(arr) +``` + +Or a dictionary of non-zero elements + +```python +vec = SparseVector({0: 1, 2: 2, 4: 3}, 6) +``` + +Note: Indices start at 0 + +Get the number of dimensions + +```python +dim = vec.dimensions() +``` + +Get the indices of non-zero elements + +```python +indices = vec.indices() +``` + +Get the values of non-zero elements + +```python +values = vec.values() +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +Get a SciPy sparse array + +```python +arr = vec.to_coo() +``` + ## History View the [changelog](https://github.com/pgvector/pgvector-python/blob/master/CHANGELOG.md) From ac1a543ab33a09efa2758f0179cea6a89257b601 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:07:29 -0700 Subject: [PATCH 397/424] Improved validation for Bit constructor --- pgvector/bit.py | 4 +++- tests/test_bit.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 4be7385..9a890a1 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -13,7 +13,9 @@ def __init__(self, value): elif value.dtype != np.bool: raise ValueError('expected dtype to be bool or uint8') else: - value = np.asarray(value, dtype=bool) + value = np.asarray(value) + if value.dtype != np.bool: + raise 
ValueError('expected dtype to be bool') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 5e1bff2..0c661d0 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,6 +7,11 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_int(self): + with pytest.raises(ValueError) as error: + Bit([254, 7, 0]) + assert str(error.value) == 'expected dtype to be bool' + def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] From 900cbb38370eebfeebdd519482cfd1a30cf6e937 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:14:46 -0700 Subject: [PATCH 398/424] Improved error message --- pgvector/bit.py | 2 +- tests/test_bit.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 9a890a1..a8feb55 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): else: value = np.asarray(value) if value.dtype != np.bool: - raise ValueError('expected dtype to be bool') + raise ValueError('expected all elements to be boolean') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 0c661d0..ae27359 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,10 +7,15 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_none(self): + with pytest.raises(ValueError) as error: + Bit([True, None, True]) + assert str(error.value) == 'expected all elements to be boolean' + def test_list_int(self): with pytest.raises(ValueError) as error: Bit([254, 7, 0]) - assert str(error.value) == 'expected dtype to be bool' + assert str(error.value) == 'expected all elements to be boolean' def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] From 
534ec18683d4c5e3058ba14d7810d0d5df7d8c55 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:34:18 -0700 Subject: [PATCH 399/424] Added support for bytes to Bit constructor --- CHANGELOG.md | 1 + pgvector/bit.py | 2 ++ tests/test_bit.py | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebc165a..89e955a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Added top-level `pgvector` package - Added support for pg8000 +- Added support for `bytes` to `Bit` constructor - Changed `globally` option to default to `False` for Psycopg 2 - Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes diff --git a/pgvector/bit.py b/pgvector/bit.py index a8feb55..8766f65 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -6,6 +6,8 @@ class Bit: def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value + elif isinstance(value, bytes): + self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: if isinstance(value, np.ndarray): if value.dtype == np.uint8: diff --git a/tests/test_bit.py b/tests/test_bit.py index ae27359..571205f 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -23,6 +23,10 @@ def test_tuple(self): def test_str(self): assert Bit('101').to_list() == [True, False, True] + def test_bytes(self): + assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] + assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) assert Bit(arr).to_text() == '111111100000011100000000' From 2d1b754773f8c4f41970b3f61b93b20460961f98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:54:26 -0700 Subject: [PATCH 
400/424] Restored backwards compatibility of Bit constructor --- pgvector/bit.py | 15 ++++++--------- tests/test_bit.py | 18 ++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 8766f65..935f0f0 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -1,5 +1,6 @@ import numpy as np from struct import pack, unpack_from +from warnings import warn class Bit: @@ -9,15 +10,11 @@ def __init__(self, value): elif isinstance(value, bytes): self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: - if isinstance(value, np.ndarray): - if value.dtype == np.uint8: - value = np.unpackbits(value).astype(bool) - elif value.dtype != np.bool: - raise ValueError('expected dtype to be bool or uint8') - else: - value = np.asarray(value) - if value.dtype != np.bool: - raise ValueError('expected all elements to be boolean') + value = np.asarray(value) + + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 571205f..a13f476 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,14 +8,12 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.raises(ValueError) as error: - Bit([True, None, True]) - assert str(error.value) == 'expected all elements to be boolean' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.raises(ValueError) as error: - Bit([254, 7, 0]) - assert str(error.value) == 'expected all elements to be boolean' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] 
@@ -29,13 +27,13 @@ def test_bytes(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - assert Bit(arr).to_text() == '111111100000011100000000' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.raises(ValueError) as error: - Bit(arr) - assert str(error.value) == 'expected dtype to be bool or uint8' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_same_object(self): arr = np.array([True, False, True]) From 2ce3f43e6693fec29e92fa84f7d46fefb96f98f0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 15:35:09 -0700 Subject: [PATCH 401/424] Improved internal representation of Bit class --- pgvector/bit.py | 47 +++++++++++++++++++++++++++-------------------- tests/test_bit.py | 10 +++++----- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 935f0f0..72b8052 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -5,51 +5,58 @@ class Bit: def __init__(self, value): - if isinstance(value, str): - self._value = self.from_text(value)._value - elif isinstance(value, bytes): - self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) + if isinstance(value, bytes): + self._len = 8 * len(value) + self._data = value else: - value = np.asarray(value) + if isinstance(value, str): + value = [v != '0' for v in value] + else: + value = np.asarray(value) - if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) - value = value.astype(bool) + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') - self._value = value + self._len = 
len(value) + self._data = np.packbits(value).tobytes() def __repr__(self): return f'Bit({self.to_text()})' def __eq__(self, other): if isinstance(other, self.__class__): - return np.array_equal(self.to_numpy(), other.to_numpy()) + return self._len == other._len and self._data == other._data return False def to_list(self): - return self._value.tolist() + return self.to_numpy().tolist() def to_numpy(self): - return self._value + return np.unpackbits(np.frombuffer(self._data, dtype=np.uint8), count=self._len).astype(bool) def to_text(self): - return ''.join(self._value.astype(np.uint8).astype(str)) + return ''.join(format(v, '08b') for v in self._data)[:self._len] def to_binary(self): - return pack('>i', len(self._value)) + np.packbits(self._value).tobytes() + return pack('>i', self._len) + self._data @classmethod def from_text(cls, value): - return cls(np.asarray([v != '0' for v in value], dtype=bool)) + return cls(str(value)) @classmethod def from_binary(cls, value): - count = unpack_from('>i', value)[0] - buf = np.frombuffer(value, dtype=np.uint8, offset=4) - return cls(np.unpackbits(buf, count=count).astype(bool)) + if not isinstance(value, bytes): + raise ValueError('expected bytes') + + bit = cls.__new__(cls) + bit._len = unpack_from('>i', value)[0] + bit._data = value[4:] + return bit @classmethod def _to_db(cls, value): diff --git a/tests/test_bit.py b/tests/test_bit.py index a13f476..cf1275e 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -25,6 +25,11 @@ def test_bytes(self): assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray(self): + arr = np.array([True, False, True]) + assert Bit(arr).to_list() == [True, False, True] + assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_uint8(self): arr 
= np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): @@ -35,11 +40,6 @@ def test_ndarray_uint16(self): with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' - def test_ndarray_same_object(self): - arr = np.array([True, False, True]) - assert Bit(arr).to_list() == [True, False, True] - assert Bit(arr).to_numpy() is arr - def test_ndim_two(self): with pytest.raises(ValueError) as error: Bit([[True, False], [True, False]]) From c2c17c2ab6365e55677bde47d1d13c63b4e87642 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:02:46 -0700 Subject: [PATCH 402/424] Removed warning for result of np.unpackbits --- pgvector/bit.py | 4 +++- tests/test_bit.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 72b8052..edfaec6 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,9 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) + # allow result of np.unpackbits + if value.dtype != np.uint8 or np.any(value > 1): + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index cf1275e..ef049c7 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -30,6 +30,10 @@ def test_ndarray(self): assert Bit(arr).to_list() == [True, False, True] assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_unpackbits(self): + arr = np.unpackbits(np.array([254, 7, 0], dtype=np.uint8)) + assert Bit(arr).to_text() == '111111100000011100000000' + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): From 50fac76f7959a155444e46d9e11be42403b09b26 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:04:10 
-0700 Subject: [PATCH 403/424] Improved test --- tests/test_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_bit.py b/tests/test_bit.py index ef049c7..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -22,8 +22,8 @@ def test_str(self): assert Bit('101').to_list() == [True, False, True] def test_bytes(self): - assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] - assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + assert Bit(b'\xff\x00\xf0').to_text() == '111111110000000011110000' + assert Bit(b'\xfe\x07\x00').to_text() == '111111100000011100000000' def test_ndarray(self): arr = np.array([True, False, True]) From 92bb02a531fc012369ee20f065028aec230d5dcf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:05:17 -0700 Subject: [PATCH 404/424] Updated comment [skip ci] --- pgvector/bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index edfaec6..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - # allow result of np.unpackbits + # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) From 4e22f9b26545f1b871cfba0fde21812ebc88ca84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:16:01 -0700 Subject: [PATCH 405/424] Updated warning message --- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 26a9d8d..e82b325 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != 
np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('expected elements to be boolean', stacklevel=2) + warn('elements should be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index 5a71642..e920228 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 7a2dd806e79ad82960cc1a89159ca61f9a12a373 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:20:17 -0700 Subject: [PATCH 406/424] Revert "Updated warning message" This reverts commit 4e22f9b26545f1b871cfba0fde21812ebc88ca84. 
--- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index e82b325..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('elements should be boolean', stacklevel=2) + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index e920228..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 6bb6df8cce6d5b03e1a8a9b683ae37faaf12db7a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:35:04 -0700 Subject: [PATCH 407/424] Removed unreleased 
import --- pgvector/psycopg2/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index b40c673..33e5124 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,11 +1,10 @@ from .register import register_vector # TODO remove -from .. import HalfVector, SparseVector, Vector +from .. import HalfVector, SparseVector __all__ = [ 'register_vector', - 'Vector', 'HalfVector', 'SparseVector' ] From a8f2a5f8428ae10d79be53c0367fc007eca4ab78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 17:53:02 -0700 Subject: [PATCH 408/424] Version bump to 0.4.0 [skip ci] --- CHANGELOG.md | 2 +- README.md | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e955a..d0e2730 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.0 (unreleased) +## 0.4.0 (2025-03-15) - Added top-level `pgvector` package - Added support for pg8000 diff --git a/README.md b/README.md index 299753e..b6bc055 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) -- [pg8000](#pg8000) [unreleased] +- [pg8000](#pg8000) - [Peewee](#peewee) Or check out some examples: diff --git a/pyproject.toml b/pyproject.toml index 0f291f5..b889f4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.3.6" +version = "0.4.0" description = "pgvector support for Python" readme = "README.md" authors = [ From e19df465f0745aef4240f5388b5ca765137397be Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 17 Mar 2025 15:58:59 -0700 Subject: [PATCH 409/424] Added basic RAG example [skip ci] --- .gitignore | 1 + README.md | 1 + examples/rag/example.py | 65 +++++++++++++++++++++++++++++++++++ 
examples/rag/requirements.txt | 3 ++ 4 files changed, 70 insertions(+) create mode 100644 examples/rag/example.py create mode 100644 examples/rag/requirements.txt diff --git a/.gitignore b/.gitignore index f7ff659..c55ff44 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv/ *.pyc __pycache__ .pytest_cache/ +examples/rag/README.md diff --git a/README.md b/README.md index b6bc055..24d9bb9 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ And follow the instructions for your database library: Or check out some examples: +- [Retrieval-augmented generation](https://github.com/pgvector/pgvector-python/blob/master/examples/rag/example.py) with Ollama - [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai/example.py) with OpenAI - [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere/example.py) with Cohere - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py) with SentenceTransformers diff --git a/examples/rag/example.py b/examples/rag/example.py new file mode 100644 index 0000000..4d5d307 --- /dev/null +++ b/examples/rag/example.py @@ -0,0 +1,65 @@ +# Run: +# ollama pull llama3.2 +# ollama pull nomic-embed-text +# ollama serve + +import numpy as np +import ollama +from pathlib import Path +from pgvector.psycopg import register_vector +import psycopg +import urllib.request + +query = 'What index types are supported?' 
+load_data = True + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +if load_data: + # get data + url = 'https://raw.githubusercontent.com/pgvector/pgvector/refs/heads/master/README.md' + dest = Path(__file__).parent / 'README.md' + if not dest.exists(): + urllib.request.urlretrieve(url, dest) + + with open(dest, encoding='utf-8') as f: + doc = f.read() + + # generate chunks + # TODO improve chunking + # TODO remove markdown + chunks = doc.split('\n## ') + + # embed chunks + # nomic-embed-text has task instruction prefix + input = ['search_document: ' + chunk for chunk in chunks] + embeddings = ollama.embed(model='nomic-embed-text', input=input).embeddings + + # create table + conn.execute('DROP TABLE IF EXISTS chunks') + conn.execute('CREATE TABLE chunks (id bigserial PRIMARY KEY, content text, embedding vector(768))') + + # store chunks + cur = conn.cursor() + with cur.copy('COPY chunks (content, embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['text', 'vector']) + + for content, embedding in zip(chunks, embeddings): + copy.write_row([content, embedding]) + +# embed query +# nomic-embed-text has task instruction prefix +input = 'search_query: ' + query +embedding = ollama.embed(model='nomic-embed-text', input=input).embeddings[0] + +# retrieve chunks +result = conn.execute('SELECT content FROM chunks ORDER BY embedding <=> %s LIMIT 5', (np.array(embedding),)).fetchall() +context = '\n\n'.join([row[0] for row in result]) + +# get answer +# TODO improve prompt +prompt = f'Answer this question: {query}\n\n{context}' +response = ollama.generate(model='llama3.2', prompt=prompt).response +print(response) diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt new file mode 100644 index 0000000..4eb5864 --- /dev/null +++ b/examples/rag/requirements.txt @@ -0,0 +1,3 @@ +ollama +pgvector +psycopg[binary] From 
1901b9cc8ab1eaf3a7415e3424509381a3399ccc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Mar 2025 01:20:18 -0700 Subject: [PATCH 410/424] Improved test [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0d8d1ca..5aec977 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -43,7 +43,7 @@ def psycopg_connect(dbapi_connection, connection_record): psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def psycopg_async_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector_async dbapi_connection.run_async(register_vector_async) @@ -51,7 +51,7 @@ def connect(dbapi_connection, connection_record): asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') @event.listens_for(asyncpg_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def asyncpg_connect(dbapi_connection, connection_record): from pgvector.asyncpg import register_vector dbapi_connection.run_async(register_vector) From eb654016181b69e9ed06871c39d8df329614cb66 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:42:07 -0700 Subject: [PATCH 411/424] Added ColBERT example for approximate search - #123 [skip ci] --- examples/colbert/approximate.py | 75 +++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 examples/colbert/approximate.py diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py new file mode 100644 index 0000000..0508d0f --- /dev/null +++ b/examples/colbert/approximate.py @@ -0,0 +1,75 @@ +# approach from section 3.6 in https://arxiv.org/abs/2004.12832 + +from colbert.infra import ColBERTConfig +from 
colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('DROP TABLE IF EXISTS document_embeddings') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') +conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + with conn.transaction(): + result = conn.execute('INSERT INTO documents (content) VALUES (%s) RETURNING id', (content,)).fetchone() + params = [] + for embedding in embeddings: + params.extend([result[0], embedding.numpy()]) + values = ', '.join(['(%s, %s)' for _ in embeddings]) + conn.execute(f'INSERT INTO document_embeddings (document_id, embedding) VALUES {values}', params) + +conn.execute('CREATE INDEX ON document_embeddings (document_id)') +conn.execute('CREATE INDEX ON 
document_embeddings USING hnsw (embedding vector_cosine_ops)') + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] +approximate_stage = ' UNION ALL '.join(['(SELECT document_id FROM document_embeddings ORDER BY embedding <=> %s LIMIT 5)' for _ in query_embeddings]) +sql = f""" +WITH approximate_stage AS ( + {approximate_stage} +), +embeddings AS ( + SELECT document_id, array_agg(embedding) AS embeddings FROM document_embeddings + WHERE document_id IN (SELECT DISTINCT document_id FROM approximate_stage) + GROUP BY document_id +) +SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents +INNER JOIN embeddings ON embeddings.document_id = documents.id +ORDER BY max_sim DESC LIMIT 10 +""" +params = [v for v in query_embeddings] + [query_embeddings] +result = conn.execute(sql, params).fetchall() +for row in result: + print(row) From 8718cdde9f91490b39a06293ec48d8f26193334b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:51:47 -0700 Subject: [PATCH 412/424] Updated comment [skip ci] --- examples/colbert/approximate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 0508d0f..fc1d396 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -1,4 +1,4 @@ -# approach from section 3.6 in https://arxiv.org/abs/2004.12832 +# based on section 3.6 of https://arxiv.org/abs/2004.12832 from colbert.infra import ColBERTConfig from colbert.modeling.checkpoint import Checkpoint From 123f74343b03a7910b8b66de4fc33127f4696430 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:53:10 -0700 Subject: [PATCH 413/424] Improved example [skip ci] --- examples/colbert/approximate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index fc1d396..290e66d 100644 --- a/examples/colbert/approximate.py +++ 
b/examples/colbert/approximate.py @@ -12,8 +12,10 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('DROP TABLE IF EXISTS document_embeddings') + conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') + conn.execute(""" CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ WITH queries AS ( @@ -69,7 +71,7 @@ INNER JOIN embeddings ON embeddings.document_id = documents.id ORDER BY max_sim DESC LIMIT 10 """ -params = [v for v in query_embeddings] + [query_embeddings] +params = query_embeddings + [query_embeddings] result = conn.execute(sql, params).fetchall() for row in result: print(row) From bef31a81ced1517f33c5fd960e7ba10f2fd5d8e2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:02:35 -0700 Subject: [PATCH 414/424] Improved ColBERT examples [skip ci] --- examples/colbert/approximate.py | 4 ++++ examples/colbert/exact.py | 4 ++++ examples/colbert/requirements.txt | 1 + 3 files changed, 9 insertions(+) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 290e66d..623f913 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -4,6 +4,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index 1c90b47..ceed2e3 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -2,6 +2,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = 
psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt index 4402ce8..54b2cb9 100644 --- a/examples/colbert/requirements.txt +++ b/examples/colbert/requirements.txt @@ -1,3 +1,4 @@ colbert-ai pgvector psycopg[binary] +transformers==4.49.0 From 208b11a893c6e5a672481847251bc13a72c84165 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:08:09 -0700 Subject: [PATCH 415/424] Improved examples[skip ci] --- examples/colbert/approximate.py | 6 +++--- examples/colbert/exact.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 623f913..14f1ce0 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -6,9 +6,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -38,6 +35,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index ceed2e3..c1ca236 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -4,9 +4,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -32,6 +29,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 6ff9b8997e75632936230829bd557281c49e1891 
Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:13:23 -0700 Subject: [PATCH 416/424] Updated ColBERT examples [skip ci] --- examples/colbert/approximate.py | 3 +-- examples/colbert/exact.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 14f1ce0..41f88b2 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -35,8 +35,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index c1ca236..e6a2936 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -29,8 +29,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 3f9e9a20b9f08033e7dc4e61ff4c43b34951d2ec Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Apr 2025 10:01:51 -0700 Subject: [PATCH 417/424] Updated Cohere example [skip ci] --- examples/cohere/example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cohere/example.py b/examples/cohere/example.py index 393d1e0..5ef4eec 100644 --- a/examples/cohere/example.py +++ b/examples/cohere/example.py @@ -9,12 +9,12 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))') def embed(input, input_type): - 
co = cohere.Client() - response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) + co = cohere.ClientV2() + response = co.embed(texts=input, model='embed-v4.0', input_type=input_type, embedding_types=['ubinary']) return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] From 713590a798190b34f4c43c4b097dbd61455113c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:09:36 -0700 Subject: [PATCH 418/424] Fixed SparseVector constructor for SciPy sparse matrices - fixes #127 --- CHANGELOG.md | 4 ++++ pgvector/sparsevec.py | 2 +- tests/test_sparse_vector.py | 14 +++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e2730..1bbd73c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.1 (unreleased) + +- Fixed `SparseVector` constructor for SciPy sparse matrices + ## 0.4.0 (2025-03-15) - Added top-level `pgvector` package diff --git a/pgvector/sparsevec.py b/pgvector/sparsevec.py index 8df2dfd..895fbd0 100644 --- a/pgvector/sparsevec.py +++ b/pgvector/sparsevec.py @@ -85,7 +85,7 @@ def _from_sparse(self, value): if hasattr(value, 'coords'): # scipy 1.13+ - self._indices = value.coords[0].tolist() + self._indices = value.coords[-1].tolist() else: self._indices = value.col.tolist() self._values = value.data.tolist() diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index dff03dd..933cfff 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array +from scipy.sparse import coo_array, csr_array, csr_matrix from struct import pack @@ -49,6 +49,18 @@ def test_dok_array(self): assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] + def test_csr_array(self): + arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + vec = 
SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_csr_matrix(self): + mat = csr_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' From 76afd8ec3013ac58bb6cc60a1b5b705f157ea18b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:15:41 -0700 Subject: [PATCH 419/424] Added test for coo_matrix --- tests/test_sparse_vector.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 933cfff..0cf0a72 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array, csr_array, csr_matrix +from scipy.sparse import coo_array, coo_matrix, csr_array, csr_matrix from struct import pack @@ -43,6 +43,12 @@ def test_coo_array_dimensions(self): SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) assert str(error.value) == 'extra argument' + def test_coo_matrix(self): + mat = coo_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_dok_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() vec = SparseVector(arr) From 809287f92847e1c609a9c395891da76f674379ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:20:20 -0700 Subject: [PATCH 420/424] Fixed CI --- tests/test_sparse_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 0cf0a72..d580f32 100644 --- a/tests/test_sparse_vector.py 
+++ b/tests/test_sparse_vector.py @@ -56,7 +56,7 @@ def test_dok_array(self): assert vec.indices() == [0, 2, 4] def test_csr_array(self): - arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + arr = csr_array(np.array([[1, 0, 2, 0, 3, 0]])) vec = SparseVector(arr) assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] From f9d2073df5cce39f0691ead6f9e030516baac7f8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 26 Apr 2025 11:56:00 -0700 Subject: [PATCH 421/424] Version bump to 0.4.1 [skip ci] --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbd73c..0ed80e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.1 (unreleased) +## 0.4.1 (2025-04-26) - Fixed `SparseVector` constructor for SciPy sparse matrices diff --git a/pyproject.toml b/pyproject.toml index b889f4b..9395f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.4.0" +version = "0.4.1" description = "pgvector support for Python" readme = "README.md" authors = [ From 7793bb069942fbcc2e77cf7349c59ffc28d8b6e0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 21 May 2025 18:16:18 -0700 Subject: [PATCH 422/424] Improved example [skip ci] --- examples/loading/example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/loading/example.py b/examples/loading/example.py index 0702129..7f3dce8 100644 --- a/examples/loading/example.py +++ b/examples/loading/example.py @@ -25,12 +25,12 @@ copy.set_types(['vector']) for i, embedding in enumerate(embeddings): + copy.write_row([embedding]) + # show progress if i % 10000 == 0: print('.', end='', flush=True) - copy.write_row([embedding]) - print('\nSuccess!') # create any indexes *after* loading initial data (skipping for this example) From 91088aacfadad37c9b8ea533b1e2b16b08d12ac4 Mon Sep 17 00:00:00 2001 From: 
Andrew Kane Date: Sun, 8 Jun 2025 16:28:24 -0700 Subject: [PATCH 423/424] Updated readme [skip ci] --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 24d9bb9..7c302b1 100644 --- a/README.md +++ b/README.md @@ -409,7 +409,7 @@ Enable the extension conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.psycopg import register_vector @@ -472,7 +472,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection or cursor +Register the types with your connection or cursor ```python from pgvector.psycopg2 import register_vector @@ -518,7 +518,7 @@ Enable the extension await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.asyncpg import register_vector @@ -572,7 +572,7 @@ Enable the extension conn.run('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.pg8000 import register_vector From ee3e71ca2c07a12a8332a3877c0ce14adc9a5da8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 16 Jun 2025 15:36:53 -0700 Subject: [PATCH 424/424] Updated format for license identifier --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9395f9e..0cfa183 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" authors = [ {name = "Andrew Kane", email = "andrew@ankane.org"} ] -license = {text = "MIT"} +license = "MIT" requires-python = ">= 3.9" dependencies = [ "numpy"