diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d711cd5..4d4e8ed 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,10 +6,10 @@ jobs: strategy: fail-fast: false matrix: - python: [3.11, 3.8] + python: [3.13, 3.9] steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - run: pip install -r requirements.txt @@ -19,8 +19,11 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.5.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install - run: pytest + + - run: pip install "SQLAlchemy<2" -U + - run: pytest tests/test_sqlalchemy.py diff --git a/.gitignore b/.gitignore index 1d3b727..c55ff44 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ venv/ .cache/ *.pyc __pycache__ +.pytest_cache/ +examples/rag/README.md diff --git a/CHANGELOG.md b/CHANGELOG.md index a8dc947..0ed80e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,65 @@ +## 0.4.1 (2025-04-26) + +- Fixed `SparseVector` constructor for SciPy sparse matrices + +## 0.4.0 (2025-03-15) + +- Added top-level `pgvector` package +- Added support for pg8000 +- Added support for `bytes` to `Bit` constructor +- Changed `globally` option to default to `False` for Psycopg 2 +- Changed `arrays` option to default to `True` for Psycopg 2 +- Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes +- Fixed `indices` and `values` methods of `SparseVector` returning tuple instead of list in some cases +- Dropped support for Python < 3.9 + +## 0.3.6 (2024-10-26) + +- Added `arrays` option for Psycopg 2 + +## 0.3.5 (2024-10-05) + +- Added `avg` function with type casting to SQLAlchemy +- Added `globally` option for Psycopg 2 + +## 0.3.4 (2024-09-26) + +- Added `schema` option for asyncpg + +## 0.3.3 (2024-09-09) + +- Improved support for cursor factories with Psycopg 2 + +## 0.3.2 (2024-07-17) + +- Fixed error with asyncpg and pgvector < 0.7 + +## 0.3.1 (2024-07-10) + +- Fixed error parsing zero sparse vectors +- Fixed error with Psycopg 2 and pgvector < 0.7 +- Fixed error message when `vector` type not found with Psycopg 3 + +## 0.3.0 (2024-06-25) + +- Added support for `halfvec`, `bit`, and `sparsevec` types to Django +- Added support for `halfvec`, `bit`, and `sparsevec` types to SQLAlchemy and SQLModel +- Added support for `halfvec` and `sparsevec` types to Psycopg 3 +- Added support for `halfvec` and `sparsevec` types to Psycopg 2 +- Added support for `halfvec` and `sparsevec` types to asyncpg +- Added support for `halfvec`, `bit`, and `sparsevec` types to Peewee +- Added `L1Distance`, `HammingDistance`, and `JaccardDistance` for Django +- Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for SQLAlchemy and SQLModel +- Added `l1_distance`, `hamming_distance`, and `jaccard_distance` for Peewee + +## 0.2.5 (2024-02-07) + +- Added literal binds support for SQLAlchemy + +## 0.2.4 (2023-11-24) + +- Improved reflection with SQLAlchemy + ## 0.2.3 (2023-09-25) - Fixed null values with Django @@ -23,8 +85,8 @@ ## 0.1.7 (2023-05-11) -- Added `register_vector_async` for psycopg3 -- Fixed `set_types` for psycopg3 +- Added `register_vector_async` for Psycopg 3 +- Fixed `set_types` for Psycopg 3 ## 0.1.6 (2022-05-22) @@ -37,12 +99,12 @@ ## 0.1.4 (2021-10-12) -- Updated psycopg3 integration for 3.0 release (no longer experimental) +- Updated 
Psycopg 3 integration for 3.0 release (no longer experimental) ## 0.1.3 (2021-06-22) - Added support for asyncpg -- Added experimental support for psycopg3 +- Added experimental support for Psycopg 3 ## 0.1.2 (2021-06-13) diff --git a/LICENSE.txt b/LICENSE.txt index b3134ac..b612d6d 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2021-2023 Andrew Kane +Copyright (c) 2021-2025 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 2199d93..f0831c4 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ +.PHONY: lint build publish clean + lint: pycodestyle . --ignore=E501 -publish: clean - python3 setup.py bdist_wheel --universal - ls dist - # twine upload dist/* - make clean +build: + python3 -m build + +publish: clean build + twine upload dist/* clean: - rm -rf .pytest_cache build dist pgvector.egg-info + rm -rf .pytest_cache dist pgvector.egg-info diff --git a/README.md b/README.md index 37030e0..7c302b1 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ [pgvector](https://github.com/pgvector/pgvector) support for Python -Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), and [Peewee](https://github.com/coleifer/peewee) +Supports [Django](https://github.com/django/django), [SQLAlchemy](https://github.com/sqlalchemy/sqlalchemy), [SQLModel](https://github.com/tiangolo/sqlmodel), [Psycopg 3](https://github.com/psycopg/psycopg), [Psycopg 2](https://github.com/psycopg/psycopg2), [asyncpg](https://github.com/MagicStack/asyncpg), [pg8000](https://github.com/tlocke/pg8000), and [Peewee](https://github.com/coleifer/peewee) -[![Build Status](https://github.com/pgvector/pgvector-python/workflows/build/badge.svg?branch=master)](https://github.com/pgvector/pgvector-python/actions) +[![Build Status](https://github.com/pgvector/pgvector-python/actions/workflows/build.yml/badge.svg)](https://github.com/pgvector/pgvector-python/actions) ## Installation @@ -22,17 +22,29 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) +- [pg8000](#pg8000) - [Peewee](#peewee) Or check out some examples: -- [Embeddings](examples/openai_embeddings.py) with OpenAI -- [Sentence embeddings](examples/sentence_embeddings.py) with SentenceTransformers -- [Hybrid search](examples/hybrid_search.py) with SentenceTransformers -- [Image search](examples/pytorch_image_search.py) with PyTorch -- [Implicit feedback recommendations](examples/implicit_recs.py) with Implicit -- [Explicit feedback recommendations](examples/surprise_recs.py) with Surprise -- [Recommendations](examples/lightfm_recs.py) with LightFM +- [Retrieval-augmented generation](https://github.com/pgvector/pgvector-python/blob/master/examples/rag/example.py) with Ollama +- [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai/example.py) with OpenAI +- [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere/example.py) with Cohere +- [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py) with 
SentenceTransformers +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/rrf.py) with SentenceTransformers (Reciprocal Rank Fusion) +- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder) +- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers +- [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT +- [Visual document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch +- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing +- [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit +- [Topic modeling](https://github.com/pgvector/pgvector-python/blob/master/examples/gensim/example.py) with Gensim +- [Implicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/implicit/example.py) with Implicit +- [Explicit feedback recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/surprise/example.py) with Surprise +- [Recommendations](https://github.com/pgvector/pgvector-python/blob/master/examples/lightfm/example.py) with LightFM +- [Horizontal scaling](https://github.com/pgvector/pgvector-python/blob/master/examples/citus/example.py) with Citus +- [Bulk loading](https://github.com/pgvector/pgvector-python/blob/master/examples/loading/example.py) with `COPY` ## Django @@ -56,6 +68,8 @@ class Item(models.Model): embedding = VectorField(dimensions=3) ``` +Also supports `HalfVectorField`, `BitField`, and `SparseVectorField` + Insert a vector ```python @@ -71,7 +85,7 @@ from pgvector.django import L2Distance Item.objects.order_by(L2Distance('embedding', [3, 1, 2]))[:5] ``` -Also supports `MaxInnerProduct` and `CosineDistance` +Also supports `MaxInnerProduct`, `CosineDistance`, `L1Distance`, `HammingDistance`, and `JaccardDistance` Get the distance @@ -98,23 +112,23 @@ Also supports `Sum` Add an approximate index ```python -from pgvector.django import IvfflatIndex, HnswIndex +from pgvector.django import HnswIndex, IvfflatIndex class Item(models.Model): class Meta: indexes = [ - IvfflatIndex( + HnswIndex( name='my_index', fields=['embedding'], - lists=100, + m=16, + ef_construction=64, opclasses=['vector_l2_ops'] ), # or - HnswIndex( + IvfflatIndex( name='my_index', fields=['embedding'], - m=16, - ef_construction=64, + lists=100, opclasses=['vector_l2_ops'] ) ] @@ -122,6 +136,36 @@ class Item(models.Model): Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +#### Half-Precision Indexing + +Index vectors at half-precision + +```python +from django.contrib.postgres.indexes import OpClass +from django.db.models.functions import Cast +from pgvector.django import HnswIndex, HalfVectorField + +class Item(models.Model): + class Meta: + indexes = [ + HnswIndex( + OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), + name='my_index', + m=16, + ef_construction=64 + ) + ] +``` + +Note: Add `'django.contrib.postgres'` to `INSTALLED_APPS` to use `OpClass` + +Get the nearest neighbors + +```python +distance 
= L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [3, 1, 2]) +Item.objects.order_by(distance)[:5] +``` + ## SQLAlchemy Enable the extension @@ -139,6 +183,8 @@ class Item(Base): embedding = mapped_column(Vector(3)) ``` +Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` + Insert a vector ```python @@ -153,7 +199,7 @@ Get the nearest neighbors to a vector session.scalars(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product` and `cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` Get the distance @@ -170,9 +216,9 @@ session.scalars(select(Item).filter(Item.embedding.l2_distance([3, 1, 2]) < 5)) Average vectors ```python -from sqlalchemy.sql import func +from pgvector.sqlalchemy import avg -session.query(func.avg(Item.embedding)).first()[0] +session.scalars(select(avg(Item.embedding))).first() ``` Also supports `sum` @@ -180,15 +226,19 @@ Also supports `sum` Add an approximate index ```python -index = Index('my_index', Item.embedding, - postgresql_using='ivfflat', - postgresql_with={'lists': 100}, +index = Index( + 'my_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'vector_l2_ops'} ) # or -index = Index('my_index', Item.embedding, - postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, +index = Index( + 'my_index', + Item.embedding, + postgresql_using='ivfflat', + postgresql_with={'lists': 100}, postgresql_ops={'embedding': 'vector_l2_ops'} ) @@ -197,24 +247,96 @@ index.create(engine) Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +#### Half-Precision Indexing + +Index vectors at half-precision + +```python +from pgvector.sqlalchemy import HALFVEC +from sqlalchemy.sql import func + +index = Index( + 'my_index', + func.cast(Item.embedding, HALFVEC(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'halfvec_l2_ops'} +) +``` + +Get the nearest neighbors + +```python +order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2]) +session.scalars(select(Item).order_by(order).limit(5)) +``` + +#### Arrays + +Add an array column + +```python +from pgvector.sqlalchemy import Vector +from sqlalchemy import ARRAY + +class Item(Base): + embeddings = mapped_column(ARRAY(Vector(3))) +``` + +And register the types with the underlying driver + +For Psycopg 3, use + +```python +from pgvector.psycopg import register_vector +from sqlalchemy import event + +@event.listens_for(engine, "connect") +def connect(dbapi_connection, connection_record): + register_vector(dbapi_connection) +``` + +For [async connections](https://docs.sqlalchemy.org/en/20/orm/extensions/asyncio.html) with Psycopg 3, use + +```python +from pgvector.psycopg import register_vector_async +from sqlalchemy import event + +@event.listens_for(engine.sync_engine, "connect") +def connect(dbapi_connection, connection_record): + dbapi_connection.run_async(register_vector_async) +``` + +For Psycopg 2, use + +```python +from pgvector.psycopg2 import register_vector +from sqlalchemy import event + +@event.listens_for(engine, "connect") +def connect(dbapi_connection, connection_record): + register_vector(dbapi_connection, arrays=True) +``` + ## SQLModel Enable the extension ```python -session.exec('CREATE EXTENSION IF NOT EXISTS vector') +session.exec(text('CREATE EXTENSION IF NOT 
EXISTS vector')) ``` Add a vector column ```python from pgvector.sqlalchemy import Vector -from sqlalchemy import Column class Item(SQLModel, table=True): - embedding: List[float] = Field(sa_column=Column(Vector(3))) + embedding: Any = Field(sa_type=Vector(3)) ``` +Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` + Insert a vector ```python @@ -229,7 +351,55 @@ Get the nearest neighbors to a vector session.exec(select(Item).order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5)) ``` -Also supports `max_inner_product` and `cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` + +Get the distance + +```python +session.exec(select(Item.embedding.l2_distance([3, 1, 2]))) +``` + +Get items within a certain distance + +```python +session.exec(select(Item).filter(Item.embedding.l2_distance([3, 1, 2]) < 5)) +``` + +Average vectors + +```python +from pgvector.sqlalchemy import avg + +session.exec(select(avg(Item.embedding))).first() +``` + +Also supports `sum` + +Add an approximate index + +```python +from sqlmodel import Index + +index = Index( + 'my_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +# or +index = Index( + 'my_index', + Item.embedding, + postgresql_using='ivfflat', + postgresql_with={'lists': 100}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) + +index.create(engine) +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance ## Psycopg 3 @@ -239,7 +409,7 @@ Enable the extension conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.psycopg import register_vector @@ -247,6 +417,15 @@ from pgvector.psycopg import register_vector register_vector(conn) ``` +For [connection pools](https://www.psycopg.org/psycopg3/docs/advanced/pool.html), use + +```python +def configure(conn): + register_vector(conn) + +pool = ConnectionPool(..., configure=configure) +``` + For [async connections](https://www.psycopg.org/psycopg3/docs/advanced/async.html), use ```python @@ -255,19 +434,35 @@ from pgvector.psycopg import register_vector_async await register_vector_async(conn) ``` +Create a table + +```python +conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python embedding = np.array([1, 2, 3]) -conn.execute('INSERT INTO item (embedding) VALUES (%s)', (embedding,)) +conn.execute('INSERT INTO items (embedding) VALUES (%s)', (embedding,)) ``` Get the nearest neighbors to a vector ```python -conn.execute('SELECT * FROM item ORDER BY embedding <-> %s LIMIT 5', (embedding,)).fetchall() +conn.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding,)).fetchall() ``` +Add an approximate index + +```python +conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Psycopg 2 Enable the extension @@ -277,7 +472,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection or cursor +Register the types with your connection or cursor ```python from pgvector.psycopg2 import register_vector @@ -285,20 +480,36 @@ from 
pgvector.psycopg2 import register_vector register_vector(conn) ``` +Create a table + +```python +cur.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python embedding = np.array([1, 2, 3]) -cur.execute('INSERT INTO item (embedding) VALUES (%s)', (embedding,)) +cur.execute('INSERT INTO items (embedding) VALUES (%s)', (embedding,)) ``` Get the nearest neighbors to a vector ```python -cur.execute('SELECT * FROM item ORDER BY embedding <-> %s LIMIT 5', (embedding,)) +cur.execute('SELECT * FROM items ORDER BY embedding <-> %s LIMIT 5', (embedding,)) cur.fetchall() ``` +Add an approximate index + +```python +cur.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +cur.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## asyncpg Enable the extension @@ -307,7 +518,7 @@ Enable the extension await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.asyncpg import register_vector @@ -324,19 +535,80 @@ async def init(conn): pool = await asyncpg.create_pool(..., init=init) ``` +Create a table + +```python +await conn.execute('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + Insert a vector ```python embedding = np.array([1, 2, 3]) -await conn.execute('INSERT INTO item (embedding) VALUES ($1)', embedding) +await conn.execute('INSERT INTO items (embedding) VALUES ($1)', embedding) ``` Get the nearest neighbors to a vector ```python -await conn.fetch('SELECT * FROM item ORDER BY embedding <-> $1 LIMIT 5', embedding) +await conn.fetch('SELECT * FROM items ORDER BY embedding <-> $1 LIMIT 5', embedding) +``` + +Add an approximate index + +```python +await conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +await conn.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') ``` +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + +## pg8000 + +Enable the extension + +```python +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +``` + +Register the types with your connection + +```python +from pgvector.pg8000 import register_vector + +register_vector(conn) +``` + +Create a table + +```python +conn.run('CREATE TABLE items (id bigserial PRIMARY KEY, embedding vector(3))') +``` + +Insert a vector + +```python +embedding = np.array([1, 2, 3]) +conn.run('INSERT INTO items (embedding) VALUES (:embedding)', embedding=embedding) +``` + +Get the nearest neighbors to a vector + +```python +conn.run('SELECT * FROM items ORDER BY embedding <-> :embedding LIMIT 5', embedding=embedding) +``` + +Add an approximate index + +```python +conn.run('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') +# or +conn.run('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 100)') +``` + +Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance + ## Peewee Add a vector column @@ -348,6 +620,8 @@ class Item(BaseModel): embedding = VectorField(dimensions=3) ``` +Also supports `HalfVectorField`, `FixedBitField`, and `SparseVectorField` + Insert a vector ```python @@ -360,7 +634,7 @@ Get the nearest neighbors to a vector Item.select().order_by(Item.embedding.l2_distance([3, 1, 2])).limit(5) ``` -Also supports `max_inner_product` and 
`cosine_distance` +Also supports `max_inner_product`, `cosine_distance`, `l1_distance`, `hamming_distance`, and `jaccard_distance` Get the distance @@ -379,7 +653,7 @@ Average vectors ```python from peewee import fn -Item.select(fn.avg(Item.embedding)).scalar() +Item.select(fn.avg(Item.embedding).coerce(True)).scalar() ``` Also supports `sum` @@ -392,6 +666,99 @@ Item.add_index('embedding vector_l2_ops', using='hnsw') Use `vector_ip_ops` for inner product and `vector_cosine_ops` for cosine distance +## Reference + +### Half Vectors + +Create a half vector from a list + +```python +vec = HalfVector([1, 2, 3]) +``` + +Or a NumPy array + +```python +vec = HalfVector(np.array([1, 2, 3])) +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +### Sparse Vectors + +Create a sparse vector from a list + +```python +vec = SparseVector([1, 0, 2, 0, 3, 0]) +``` + +Or a NumPy array + +```python +vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) +``` + +Or a SciPy sparse array + +```python +arr = coo_array(([1, 2, 3], ([0, 2, 4],)), shape=(6,)) +vec = SparseVector(arr) +``` + +Or a dictionary of non-zero elements + +```python +vec = SparseVector({0: 1, 2: 2, 4: 3}, 6) +``` + +Note: Indices start at 0 + +Get the number of dimensions + +```python +dim = vec.dimensions() +``` + +Get the indices of non-zero elements + +```python +indices = vec.indices() +``` + +Get the values of non-zero elements + +```python +values = vec.values() +``` + +Get a list + +```python +lst = vec.to_list() +``` + +Get a NumPy array + +```python +arr = vec.to_numpy() +``` + +Get a SciPy sparse array + +```python +arr = vec.to_coo() +``` + ## History View the [changelog](https://github.com/pgvector/pgvector-python/blob/master/CHANGELOG.md) @@ -414,3 +781,12 @@ pip install -r requirements.txt createdb pgvector_python_test pytest ``` + +To run an example: + +```sh +cd examples/loading +pip install -r requirements.txt +createdb pgvector_example +python3 example.py +``` diff --git a/examples/citus/example.py b/examples/citus/example.py new file mode 100644 index 0000000..915c25f --- /dev/null +++ b/examples/citus/example.py @@ -0,0 +1,49 @@ +import numpy as np +from pgvector.psycopg import register_vector +import psycopg + +# generate random data +rows = 100000 +dimensions = 128 +embeddings = np.random.rand(rows, dimensions) +categories = np.random.randint(100, size=rows).tolist() +queries = np.random.rand(10, dimensions) + +# enable extensions +conn = psycopg.connect(dbname='pgvector_citus', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS citus') +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + +# GUC variables set on the session do not propagate to Citus workers +# https://github.com/citusdata/citus/issues/462 +# you can either: +# 1. set them on the system, user, or database and reconnect +# 2. 
set them for a transaction with SET LOCAL +conn.execute("ALTER DATABASE pgvector_citus SET maintenance_work_mem = '512MB'") +conn.execute('ALTER DATABASE pgvector_citus SET hnsw.ef_search = 20') +conn.close() + +# reconnect for updated GUC variables to take effect +conn = psycopg.connect(dbname='pgvector_citus', autocommit=True) +register_vector(conn) + +print('Creating distributed table') +conn.execute('DROP TABLE IF EXISTS items') +conn.execute('CREATE TABLE items (id bigserial, embedding vector(%d), category_id bigint, PRIMARY KEY (id, category_id))' % dimensions) +conn.execute('SET citus.shard_count = 4') +conn.execute("SELECT create_distributed_table('items', 'category_id')") + +print('Loading data in parallel') +with conn.cursor().copy('COPY items (embedding, category_id) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['vector', 'bigint']) + + for i in range(rows): + copy.write_row([embeddings[i], categories[i]]) + +print('Creating index in parallel') +conn.execute('CREATE INDEX ON items USING hnsw (embedding vector_l2_ops)') + +print('Running distributed queries') +for query in queries: + items = conn.execute('SELECT id FROM items ORDER BY embedding <-> %s LIMIT 10', (query,)).fetchall() + print([r[0] for r in items]) diff --git a/examples/citus/requirements.txt b/examples/citus/requirements.txt new file mode 100644 index 0000000..1cf8ee9 --- /dev/null +++ b/examples/citus/requirements.txt @@ -0,0 +1,3 @@ +numpy +pgvector +psycopg[binary] diff --git a/examples/cohere/example.py b/examples/cohere/example.py new file mode 100644 index 0000000..5ef4eec --- /dev/null +++ b/examples/cohere/example.py @@ -0,0 +1,34 @@ +import cohere +import numpy as np +from pgvector.psycopg import register_vector, Bit +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))') + + +def embed(input, input_type): + co = cohere.ClientV2() + response = co.embed(texts=input, model='embed-v4.0', input_type=input_type, embedding_types=['ubinary']) + return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input, 'search_document') +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, Bit(embedding))) + +query = 'forest' +query_embedding = embed([query], 'search_query')[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <~> %s LIMIT 5', (Bit(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/cohere/requirements.txt b/examples/cohere/requirements.txt new file mode 100644 index 0000000..22fd056 --- /dev/null +++ b/examples/cohere/requirements.txt @@ -0,0 +1,3 @@ +cohere +pgvector +psycopg[binary] diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py new file mode 100644 index 0000000..41f88b2 --- /dev/null +++ b/examples/colbert/approximate.py @@ -0,0 +1,80 @@ +# based on section 3.6 of https://arxiv.org/abs/2004.12832 + +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector +import psycopg +import warnings + +conn = 
psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('DROP TABLE IF EXISTS document_embeddings') + +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') +conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') + +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +warnings.filterwarnings('ignore') # ignore warnings from colbert + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + with conn.transaction(): + result = conn.execute('INSERT INTO documents (content) VALUES (%s) RETURNING id', (content,)).fetchone() + params = [] + for embedding in embeddings: + params.extend([result[0], embedding.numpy()]) + values = ', '.join(['(%s, %s)' for _ in embeddings]) + conn.execute(f'INSERT INTO document_embeddings (document_id, embedding) VALUES {values}', params) + +conn.execute('CREATE INDEX ON document_embeddings (document_id)') +conn.execute('CREATE INDEX ON document_embeddings USING hnsw (embedding vector_cosine_ops)') + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] +approximate_stage = ' UNION ALL '.join(['(SELECT document_id FROM document_embeddings ORDER BY embedding <=> %s LIMIT 5)' for _ in query_embeddings]) +sql = f""" +WITH approximate_stage AS ( + {approximate_stage} +), +embeddings AS ( + SELECT document_id, array_agg(embedding) AS embeddings FROM document_embeddings + WHERE document_id IN (SELECT DISTINCT document_id FROM approximate_stage) + GROUP BY document_id +) +SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents +INNER JOIN embeddings ON embeddings.document_id = documents.id +ORDER BY max_sim DESC LIMIT 10 +""" +params = query_embeddings + [query_embeddings] +result = conn.execute(sql, params).fetchall() +for row in result: + print(row) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py new file mode 100644 index 0000000..e6a2936 --- /dev/null +++ b/examples/colbert/exact.py @@ -0,0 +1,51 @@ +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector +import psycopg +import warnings + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embeddings vector(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], 
query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +warnings.filterwarnings('ignore') # ignore warnings from colbert + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + embeddings = [e.numpy() for e in embeddings] + conn.execute('INSERT INTO documents (content, embeddings) VALUES (%s, %s)', (content, embeddings)) + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] +result = conn.execute('SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt new file mode 100644 index 0000000..54b2cb9 --- /dev/null +++ b/examples/colbert/requirements.txt @@ -0,0 +1,4 @@ +colbert-ai +pgvector +psycopg[binary] +transformers==4.49.0 diff --git a/examples/colpali/exact.py b/examples/colpali/exact.py new file mode 100644 index 0000000..80bb603 --- /dev/null +++ b/examples/colpali/exact.py @@ -0,0 +1,56 @@ +from colpali_engine.models import ColQwen2, ColQwen2Processor +from colpali_engine.utils.torch_utils import get_torch_device +from datasets import load_dataset +from pgvector.psycopg import register_vector, Bit +import psycopg +import torch + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings bit(128)[])') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +device = get_torch_device('auto') +model = ColQwen2.from_pretrained('vidore/colqwen2-v1.0', torch_dtype=torch.bfloat16, device_map=device).eval() +processor = ColQwen2Processor.from_pretrained('vidore/colqwen2-v1.0') + + +def generate_embeddings(processed): + with torch.no_grad(): + return model(**processed.to(model.device)).to(torch.float32).numpy(force=True) + + +def binary_quantize(embedding): + return Bit(embedding > 0) + + +input = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image'] +for content in input: + embeddings = [binary_quantize(e) for e in 
generate_embeddings(processor.process_images([content]))[0]] + conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,)) + +query = 'dividend' +query_embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_queries([query]))[0]] +result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall() +for row in result: + print(row) diff --git a/examples/colpali/requirements.txt b/examples/colpali/requirements.txt new file mode 100644 index 0000000..4cf770d --- /dev/null +++ b/examples/colpali/requirements.txt @@ -0,0 +1,4 @@ +colpali-engine +datasets +pgvector +psycopg[binary] diff --git a/examples/openai_embeddings.py b/examples/gensim/example.py similarity index 58% rename from examples/openai_embeddings.py rename to examples/gensim/example.py index 08926f0..cfbf18d 100644 --- a/examples/openai_embeddings.py +++ b/examples/gensim/example.py @@ -1,4 +1,7 @@ -import openai +from gensim.corpora.dictionary import Dictionary +from gensim.models import LdaModel +from gensim.utils import simple_preprocess +import numpy as np from pgvector.psycopg import register_vector import psycopg @@ -8,7 +11,7 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(20))') input = [ 'The dog is barking', @@ -16,10 +19,14 @@ 'The bear is growling' ] -response = openai.Embedding.create(input=input, model='text-embedding-ada-002') -embeddings = [v['embedding'] for v in response['data']] +docs = [simple_preprocess(content) for content in input] +dictionary = Dictionary(docs) +dictionary.filter_extremes(no_below=1) +corpus = [dictionary.doc2bow(tokens) for tokens in docs] +model = LdaModel(corpus, num_topics=20) -for content, embedding in zip(input, embeddings): +for content, bow in zip(input, corpus): + embedding = np.array([v[1] for v in model.get_document_topics(bow, minimum_probability=0)]) conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) document_id = 1 diff --git a/examples/gensim/requirements.txt b/examples/gensim/requirements.txt new file mode 100644 index 0000000..15411cd --- /dev/null +++ b/examples/gensim/requirements.txt @@ -0,0 +1,5 @@ +gensim +numpy +pgvector +psycopg[binary] +scipy<1.13 diff --git a/examples/hybrid_search.py b/examples/hybrid_search/cross_encoder.py similarity index 100% rename from examples/hybrid_search.py rename to examples/hybrid_search/cross_encoder.py diff --git a/examples/hybrid_search/requirements.txt b/examples/hybrid_search/requirements.txt new file mode 100644 index 0000000..237dcd1 --- /dev/null +++ b/examples/hybrid_search/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +sentence-transformers diff --git a/examples/hybrid_search_rrf.py b/examples/hybrid_search/rrf.py similarity index 100% rename from examples/hybrid_search_rrf.py rename to examples/hybrid_search/rrf.py diff --git a/examples/pytorch_image_search.py b/examples/image_search/example.py similarity index 67% rename from examples/pytorch_image_search.py rename to examples/image_search/example.py index 0cc9af0..b4cf131 100644 --- a/examples/pytorch_image_search.py +++ b/examples/image_search/example.py @@ -8,13 +8,11 @@ seed = True - # establish connection conn = psycopg.connect(dbname='pgvector_example', autocommit=True) 
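# note: enable the extension before registering the type below;
# register_vector looks up the vector type in the database catalog
# and raises an error if the extension has not been created yet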
conn.execute('CREATE EXTENSION IF NOT EXISTS vector') register_vector(conn) - # load images transform = torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), @@ -23,7 +21,6 @@ dataset = torchvision.datasets.CIFAR10(root=tempfile.gettempdir(), train=True, download=True, transform=transform) dataloader = torch.utils.data.DataLoader(dataset, batch_size=1000) - # load pretrained model device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') model = torchvision.models.resnet18(weights='DEFAULT') @@ -36,36 +33,37 @@ def generate_embeddings(inputs): return model(inputs.to(device)).detach().cpu().numpy() -# generate, save, and index embeddings +# generate and store embeddings if seed: - conn.execute('DROP TABLE IF EXISTS image') - conn.execute('CREATE TABLE image (id bigserial PRIMARY KEY, embedding vector(512))') + conn.execute('DROP TABLE IF EXISTS images') + conn.execute('CREATE TABLE images (id bigserial PRIMARY KEY, embedding vector(512))') print('Generating embeddings') for data in tqdm(dataloader): embeddings = generate_embeddings(data[0]) - sql = 'INSERT INTO image (embedding) VALUES ' + ','.join(['(%s)' for _ in embeddings]) + sql = 'INSERT INTO images (embedding) VALUES ' + ','.join(['(%s)' for _ in embeddings]) params = [embedding for embedding in embeddings] conn.execute(sql, params) - -def show_images(dataset_images): - grid = torchvision.utils.make_grid(dataset_images) - img = (grid / 2 + 0.5).permute(1, 2, 0).numpy() - plt.imshow(img) - plt.draw() - plt.waitforbuttonpress(timeout=3) - - # load 5 random unseen images queryset = torchvision.datasets.CIFAR10(root=tempfile.gettempdir(), train=False, download=True, transform=transform) queryloader = torch.utils.data.DataLoader(queryset, batch_size=5, shuffle=True) images = next(iter(queryloader))[0] - # generate and query embeddings +results = [] embeddings = generate_embeddings(images) for image, embedding in zip(images, embeddings): - result = conn.execute('SELECT id FROM image ORDER BY embedding <=> %s LIMIT 15', (embedding,)).fetchall() - show_images([image] + [dataset[row[0] - 1][0] for row in result]) + result = conn.execute('SELECT id FROM images ORDER BY embedding <=> %s LIMIT 5', (embedding,)).fetchall() + nearest_images = [dataset[row[0] - 1][0] for row in result] + results.append([image] + nearest_images) + +# show images +fig, axs = plt.subplots(len(results), len(results[0])) +for i, result in enumerate(results): + for j, image in enumerate(result): + ax = axs[i, j] + ax.imshow((image / 2 + 0.5).permute(1, 2, 0).numpy()) + ax.set_axis_off() +plt.show(block=True) diff --git a/examples/image_search/requirements.txt b/examples/image_search/requirements.txt new file mode 100644 index 0000000..3d82365 --- /dev/null +++ b/examples/image_search/requirements.txt @@ -0,0 +1,6 @@ +matplotlib +pgvector +psycopg[binary] +torch +torchvision +tqdm diff --git a/examples/imagehash/example.py b/examples/imagehash/example.py new file mode 100644 index 0000000..f49af40 --- /dev/null +++ b/examples/imagehash/example.py @@ -0,0 +1,47 @@ +from datasets import load_dataset +from imagehash import phash +import matplotlib.pyplot as plt +from pgvector.psycopg import register_vector, Bit +import psycopg + + +def hash_image(img): + return ''.join(['1' if v else '0' for v in phash(img).hash.flatten()]) + + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS images') +conn.execute('CREATE 
TABLE images (id bigserial PRIMARY KEY, hash bit(64))') + +print('Loading dataset') +dataset = load_dataset('mnist') + +print('Generating hashes') +images = [{'hash': hash_image(row['image'])} for row in dataset['train']] + +print('Storing hashes') +cur = conn.cursor() +with cur.copy('COPY images (hash) FROM STDIN') as copy: + for image in images: + copy.write_row([Bit(image['hash'])]) + +print('Querying hashes') +results = [] +for i in range(5): + image = dataset['test'][i]['image'] + result = conn.execute('SELECT id FROM images ORDER BY hash <~> %s LIMIT 5', (hash_image(image),)).fetchall() + nearest_images = [dataset['train'][row[0] - 1]['image'] for row in result] + results.append([image] + nearest_images) + +print('Showing results (first column is query image)') +fig, axs = plt.subplots(len(results), len(results[0])) +for i, result in enumerate(results): + for j, image in enumerate(result): + ax = axs[i, j] + ax.imshow(image) + ax.set_axis_off() +plt.show(block=True) diff --git a/examples/imagehash/requirements.txt b/examples/imagehash/requirements.txt new file mode 100644 index 0000000..e3971e6 --- /dev/null +++ b/examples/imagehash/requirements.txt @@ -0,0 +1,5 @@ +datasets +imagehash +matplotlib +pgvector +psycopg[binary] diff --git a/examples/implicit_recs.py b/examples/implicit/example.py similarity index 100% rename from examples/implicit_recs.py rename to examples/implicit/example.py diff --git a/examples/implicit/requirements.txt b/examples/implicit/requirements.txt new file mode 100644 index 0000000..424abbd --- /dev/null +++ b/examples/implicit/requirements.txt @@ -0,0 +1,5 @@ +h5py +implicit +pgvector +psycopg[binary] +SQLAlchemy diff --git a/examples/lightfm_recs.py b/examples/lightfm/example.py similarity index 100% rename from examples/lightfm_recs.py rename to examples/lightfm/example.py diff --git a/examples/lightfm/requirements.txt b/examples/lightfm/requirements.txt new file mode 100644 index 0000000..cfa5f51 --- /dev/null +++ b/examples/lightfm/requirements.txt @@ -0,0 +1,4 @@ +lightfm +pgvector +psycopg[binary] +SQLAlchemy diff --git a/examples/loading/example.py b/examples/loading/example.py new file mode 100644 index 0000000..7f3dce8 --- /dev/null +++ b/examples/loading/example.py @@ -0,0 +1,45 @@ +import numpy as np +from pgvector.psycopg import register_vector +import psycopg + +# generate random data +rows = 1000000 +dimensions = 128 +embeddings = np.random.rand(rows, dimensions) + +# enable extension +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +# create table +conn.execute('DROP TABLE IF EXISTS items') +conn.execute(f'CREATE TABLE items (id bigserial, embedding vector({dimensions}))') + +# load data +print(f'Loading {len(embeddings)} rows') +cur = conn.cursor() +with cur.copy('COPY items (embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + # use set_types for binary copy + # https://www.psycopg.org/psycopg3/docs/basic/copy.html#binary-copy + copy.set_types(['vector']) + + for i, embedding in enumerate(embeddings): + copy.write_row([embedding]) + + # show progress + if i % 10000 == 0: + print('.', end='', flush=True) + +print('\nSuccess!') + +# create any indexes *after* loading initial data (skipping for this example) +create_index = False +if create_index: + print('Creating index') + conn.execute("SET maintenance_work_mem = '8GB'") + conn.execute('SET max_parallel_maintenance_workers = 7') + conn.execute('CREATE INDEX ON items USING hnsw (embedding 
vector_cosine_ops)') + +# update planner statistics for good measure +conn.execute('ANALYZE items') diff --git a/examples/loading/requirements.txt b/examples/loading/requirements.txt new file mode 100644 index 0000000..1cf8ee9 --- /dev/null +++ b/examples/loading/requirements.txt @@ -0,0 +1,3 @@ +numpy +pgvector +psycopg[binary] diff --git a/examples/openai/example.py b/examples/openai/example.py new file mode 100644 index 0000000..b9a078c --- /dev/null +++ b/examples/openai/example.py @@ -0,0 +1,34 @@ +import numpy as np +from openai import OpenAI +from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))') + + +def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-small') + return [v.embedding for v in response.data] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, np.array(embedding))) + +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (np.array(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/openai/halfvec.py b/examples/openai/halfvec.py new file mode 100644 index 0000000..185c785 --- /dev/null +++ b/examples/openai/halfvec.py @@ -0,0 +1,34 @@ +from openai import OpenAI +from pgvector.psycopg import register_vector, HalfVector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding halfvec(3072))') +conn.execute('CREATE INDEX ON documents USING hnsw (embedding halfvec_cosine_ops)') + + +def embed(input): + client = OpenAI() + response = client.embeddings.create(input=input, model='text-embedding-3-large') + return [v.embedding for v in response.data] + + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, HalfVector(embedding))) + +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (HalfVector(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt new file mode 100644 index 0000000..18587e2 --- /dev/null +++ b/examples/openai/requirements.txt @@ -0,0 +1,3 @@ +openai +pgvector +psycopg[binary] diff --git a/examples/rag/example.py b/examples/rag/example.py new file mode 100644 index 0000000..4d5d307 --- /dev/null +++ b/examples/rag/example.py @@ -0,0 +1,65 @@ +# Run: +# ollama pull llama3.2 +# ollama pull nomic-embed-text +# ollama serve + +import numpy as np +import ollama +from pathlib import Path +from pgvector.psycopg import register_vector +import psycopg +import 
urllib.request + +query = 'What index types are supported?' +load_data = True + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +if load_data: + # get data + url = 'https://raw.githubusercontent.com/pgvector/pgvector/refs/heads/master/README.md' + dest = Path(__file__).parent / 'README.md' + if not dest.exists(): + urllib.request.urlretrieve(url, dest) + + with open(dest, encoding='utf-8') as f: + doc = f.read() + + # generate chunks + # TODO improve chunking + # TODO remove markdown + chunks = doc.split('\n## ') + + # embed chunks + # nomic-embed-text has task instruction prefix + input = ['search_document: ' + chunk for chunk in chunks] + embeddings = ollama.embed(model='nomic-embed-text', input=input).embeddings + + # create table + conn.execute('DROP TABLE IF EXISTS chunks') + conn.execute('CREATE TABLE chunks (id bigserial PRIMARY KEY, content text, embedding vector(768))') + + # store chunks + cur = conn.cursor() + with cur.copy('COPY chunks (content, embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['text', 'vector']) + + for content, embedding in zip(chunks, embeddings): + copy.write_row([content, embedding]) + +# embed query +# nomic-embed-text has task instruction prefix +input = 'search_query: ' + query +embedding = ollama.embed(model='nomic-embed-text', input=input).embeddings[0] + +# retrieve chunks +result = conn.execute('SELECT content FROM chunks ORDER BY embedding <=> %s LIMIT 5', (np.array(embedding),)).fetchall() +context = '\n\n'.join([row[0] for row in result]) + +# get answer +# TODO improve prompt +prompt = f'Answer this question: {query}\n\n{context}' +response = ollama.generate(model='llama3.2', prompt=prompt).response +print(response) diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt new file mode 100644 index 0000000..4eb5864 --- /dev/null +++ b/examples/rag/requirements.txt @@ -0,0 +1,3 @@ +ollama +pgvector +psycopg[binary] diff --git a/examples/rdkit/example.py b/examples/rdkit/example.py new file mode 100644 index 0000000..afb56ec --- /dev/null +++ b/examples/rdkit/example.py @@ -0,0 +1,32 @@ +# good resource +# https://www.rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints + +from pgvector.psycopg import register_vector, Bit +import psycopg +from rdkit import Chem +from rdkit.Chem import AllChem + + +def generate_fingerprint(molecule): + fpgen = AllChem.GetMorganGenerator() + return fpgen.GetFingerprintAsNumPy(Chem.MolFromSmiles(molecule)) + + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS molecules') +conn.execute('CREATE TABLE molecules (id text PRIMARY KEY, fingerprint bit(2048))') + +molecules = ['Cc1ccccc1', 'Cc1ncccc1', 'c1ccccn1'] +for molecule in molecules: + fingerprint = generate_fingerprint(molecule) + conn.execute('INSERT INTO molecules (id, fingerprint) VALUES (%s, %s)', (molecule, Bit(fingerprint))) + +query_molecule = 'c1ccco1' +query_fingerprint = generate_fingerprint(query_molecule) +result = conn.execute('SELECT id, fingerprint <%%> %s AS distance FROM molecules ORDER BY distance LIMIT 5', (Bit(query_fingerprint),)).fetchall() +for row in result: + print(row) diff --git a/examples/rdkit/requirements.txt b/examples/rdkit/requirements.txt new file mode 100644 index 0000000..85a3e4f --- /dev/null +++ 
b/examples/rdkit/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +rdkit diff --git a/examples/requirements.txt b/examples/requirements.txt deleted file mode 100644 index 4be7ae1..0000000 --- a/examples/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -implicit -lightfm -openai -torch -torchvision -scikit-surprise -sentence-transformers diff --git a/examples/sentence_embeddings.py b/examples/sentence_transformers/example.py similarity index 69% rename from examples/sentence_embeddings.py rename to examples/sentence_transformers/example.py index d4e7f96..50997d9 100644 --- a/examples/sentence_embeddings.py +++ b/examples/sentence_transformers/example.py @@ -10,19 +10,19 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(384))') +model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1') + input = [ 'The dog is barking', 'The cat is purring', 'The bear is growling' ] - -model = SentenceTransformer('all-MiniLM-L6-v2') embeddings = model.encode(input) - for content, embedding in zip(input, embeddings): conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding)) -document_id = 1 -neighbors = conn.execute('SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5', {'id': document_id}).fetchall() -for neighbor in neighbors: - print(neighbor[0]) +query = 'forest' +query_embedding = model.encode(query) +result = conn.execute('SELECT content FROM documents ORDER BY embedding <=> %s LIMIT 5', (query_embedding,)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/sentence_transformers/requirements.txt b/examples/sentence_transformers/requirements.txt new file mode 100644 index 0000000..237dcd1 --- /dev/null +++ b/examples/sentence_transformers/requirements.txt @@ -0,0 +1,3 @@ +pgvector +psycopg[binary] +sentence-transformers diff --git a/examples/sparse_search/example.py b/examples/sparse_search/example.py new file mode 100644 index 0000000..2b5daea --- /dev/null +++ b/examples/sparse_search/example.py @@ -0,0 +1,53 @@ +# good resources +# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/ +# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1 + +from pgvector.psycopg import register_vector, SparseVector +import psycopg +import torch +from transformers import AutoModelForMaskedLM, AutoTokenizer + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))') + +model_id = 'opensearch-project/opensearch-neural-sparse-encoding-v1' +model = AutoModelForMaskedLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +special_token_ids = [tokenizer.vocab[token] for token in tokenizer.special_tokens_map.values()] + + +def embed(input): + feature = tokenizer( + input, + padding=True, + truncation=True, + return_tensors='pt', + return_token_type_ids=False + ) + output = model(**feature)[0] + + values, _ = torch.max(output * feature['attention_mask'].unsqueeze(-1), dim=1) + values = torch.log(1 + torch.relu(values)) + values[:, special_token_ids] = 0 + return values.detach().cpu().numpy() + + +# note: works much better with longer content +input = [ + 'The dog 
is barking', + 'The cat is purring', + 'The bear is growling' +] +embeddings = embed(input) +for content, embedding in zip(input, embeddings): + conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, SparseVector(embedding))) + +query = 'forest' +query_embedding = embed([query])[0] +result = conn.execute('SELECT content FROM documents ORDER BY embedding <#> %s LIMIT 5', (SparseVector(query_embedding),)).fetchall() +for row in result: + print(row[0]) diff --git a/examples/sparse_search/requirements.txt b/examples/sparse_search/requirements.txt new file mode 100644 index 0000000..3de81c7 --- /dev/null +++ b/examples/sparse_search/requirements.txt @@ -0,0 +1,5 @@ +numpy +pgvector +psycopg[binary] +torch +transformers diff --git a/examples/surprise_recs.py b/examples/surprise/example.py similarity index 100% rename from examples/surprise_recs.py rename to examples/surprise/example.py diff --git a/examples/surprise/requirements.txt b/examples/surprise/requirements.txt new file mode 100644 index 0000000..cb2dca4 --- /dev/null +++ b/examples/surprise/requirements.txt @@ -0,0 +1,4 @@ +pgvector +psycopg[binary] +scikit-surprise +SQLAlchemy diff --git a/pgvector/__init__.py b/pgvector/__init__.py new file mode 100644 index 0000000..3c01160 --- /dev/null +++ b/pgvector/__init__.py @@ -0,0 +1,11 @@ +from .bit import Bit +from .halfvec import HalfVector +from .sparsevec import SparseVector +from .vector import Vector + +__all__ = [ + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] diff --git a/pgvector/asyncpg/__init__.py b/pgvector/asyncpg/__init__.py index 5797327..c6a3b4e 100644 --- a/pgvector/asyncpg/__init__.py +++ b/pgvector/asyncpg/__init__.py @@ -1,12 +1,11 @@ -from ..utils import from_db, from_db_binary, to_db, to_db_binary +from .register import register_vector -__all__ = ['register_vector'] +# TODO remove +from .. import Vector, HalfVector, SparseVector - -async def register_vector(conn): - await conn.set_type_codec( - 'vector', - encoder=to_db_binary, - decoder=from_db_binary, - format='binary' - ) +__all__ = [ + 'register_vector', + 'Vector', + 'HalfVector', + 'SparseVector' +] diff --git a/pgvector/asyncpg/register.py b/pgvector/asyncpg/register.py new file mode 100644 index 0000000..63726f3 --- /dev/null +++ b/pgvector/asyncpg/register.py @@ -0,0 +1,31 @@ +from .. 
import Vector, HalfVector, SparseVector
+
+
+async def register_vector(conn, schema='public'):
+    await conn.set_type_codec(
+        'vector',
+        schema=schema,
+        encoder=Vector._to_db_binary,
+        decoder=Vector._from_db_binary,
+        format='binary'
+    )
+
+    try:
+        await conn.set_type_codec(
+            'halfvec',
+            schema=schema,
+            encoder=HalfVector._to_db_binary,
+            decoder=HalfVector._from_db_binary,
+            format='binary'
+        )
+
+        await conn.set_type_codec(
+            'sparsevec',
+            schema=schema,
+            encoder=SparseVector._to_db_binary,
+            decoder=SparseVector._from_db_binary,
+            format='binary'
+        )
+    except ValueError as e:
+        if not str(e).startswith('unknown type:'):
+            raise e
diff --git a/pgvector/bit.py b/pgvector/bit.py
new file mode 100644
index 0000000..26a9d8d
--- /dev/null
+++ b/pgvector/bit.py
@@ -0,0 +1,75 @@
+import numpy as np
+from struct import pack, unpack_from
+from warnings import warn
+
+
+class Bit:
+    def __init__(self, value):
+        if isinstance(value, bytes):
+            self._len = 8 * len(value)
+            self._data = value
+        else:
+            if isinstance(value, str):
+                value = [v != '0' for v in value]
+            # normalize to ndarray so the dtype/ndim checks work for all inputs
+            value = np.asarray(value)
+
+            if value.dtype != np.bool_:
+                # skip warning for result of np.unpackbits
+                if value.dtype != np.uint8 or np.any(value > 1):
+                    warn('expected elements to be boolean', stacklevel=2)
+                value = value.astype(bool)
+
+            if value.ndim != 1:
+                raise ValueError('expected ndim to be 1')
+
+            self._len = len(value)
+            self._data = np.packbits(value).tobytes()
+
+    def __repr__(self):
+        return f'Bit({self.to_text()})'
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return self._len == other._len and self._data == other._data
+        return False
+
+    def to_list(self):
+        return self.to_numpy().tolist()
+
+    def to_numpy(self):
+        return np.unpackbits(np.frombuffer(self._data, dtype=np.uint8), count=self._len).astype(bool)
+
+    def to_text(self):
+        return ''.join(format(v, '08b') for v in self._data)[:self._len]
+
+    def to_binary(self):
+        return pack('>i', self._len) + self._data
+
+    @classmethod
+    def from_text(cls, value):
+        return cls(str(value))
+
+    @classmethod
+    def from_binary(cls, value):
+        if not isinstance(value, bytes):
+            raise ValueError('expected bytes')
+
+        bit = cls.__new__(cls)
+        bit._len = unpack_from('>i', value)[0]
+        bit._data = value[4:]
+        return bit
+
+    @classmethod
+    def _to_db(cls, value):
+        if not isinstance(value, cls):
+            raise ValueError('expected bit')
+
+        return value.to_text()
+
+    @classmethod
+    def _to_db_binary(cls, value):
+        if not isinstance(value, cls):
+            raise ValueError('expected bit')
+
+        return value.to_binary()
diff --git a/pgvector/django/__init__.py b/pgvector/django/__init__.py
index ab250e6..43c64a3 100644
--- a/pgvector/django/__init__.py
+++ b/pgvector/django/__init__.py
@@ -1,130 +1,28 @@
-from django.contrib.postgres.operations import CreateExtension
-from django.contrib.postgres.indexes import PostgresIndex
-from django.db.models import Field, FloatField, Func, Value
-import numpy as np
-from .forms import VectorFormField
-from ..utils import from_db, to_db
-
-__all__ = ['VectorExtension', 'VectorField', 'IvfflatIndex', 'HnswIndex', 'L2Distance', 'MaxInnerProduct', 'CosineDistance']
-
-
-class VectorExtension(CreateExtension):
-    def __init__(self):
-        self.name = 'vector'
-
-
-# https://docs.djangoproject.com/en/4.2/howto/custom-model-fields/
-class VectorField(Field):
-    description = 'Vector'
-    empty_strings_allowed = False
-
-    def __init__(self, *args, dimensions=None, **kwargs):
-        self.dimensions = dimensions
-        super().__init__(*args, **kwargs)
-
-    
def deconstruct(self): - name, path, args, kwargs = super().deconstruct() - if self.dimensions is not None: - kwargs['dimensions'] = self.dimensions - return name, path, args, kwargs - - def db_type(self, connection): - if self.dimensions is None: - return 'vector' - return 'vector(%d)' % self.dimensions - - def from_db_value(self, value, expression, connection): - return from_db(value) - - def to_python(self, value): - if isinstance(value, list): - return np.array(value, dtype=np.float32) - return from_db(value) - - def get_prep_value(self, value): - return to_db(value) - - def value_to_string(self, obj): - return self.get_prep_value(self.value_from_object(obj)) - - def validate(self, value, model_instance): - if isinstance(value, np.ndarray): - value = value.tolist() - super().validate(value, model_instance) - - def run_validators(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() - super().run_validators(value) - - def formfield(self, **kwargs): - return super().formfield(form_class=VectorFormField, **kwargs) - - -class IvfflatIndex(PostgresIndex): - suffix = 'ivfflat' - - def __init__(self, *expressions, lists=None, **kwargs): - self.lists = lists - super().__init__(*expressions, **kwargs) - - def deconstruct(self): - path, args, kwargs = super().deconstruct() - if self.lists is not None: - kwargs['lists'] = self.lists - return path, args, kwargs - - def get_with_params(self): - with_params = [] - if self.lists is not None: - with_params.append('lists = %d' % self.lists) - return with_params - - -class HnswIndex(PostgresIndex): - suffix = 'hnsw' - - def __init__(self, *expressions, m=None, ef_construction=None, **kwargs): - self.m = m - self.ef_construction = ef_construction - super().__init__(*expressions, **kwargs) - - def deconstruct(self): - path, args, kwargs = super().deconstruct() - if self.m is not None: - kwargs['m'] = self.m - if self.ef_construction is not None: - kwargs['ef_construction'] = self.ef_construction - return path, args, kwargs - - def get_with_params(self): - with_params = [] - if self.m is not None: - with_params.append('m = %d' % self.m) - if self.ef_construction is not None: - with_params.append('ef_construction = %d' % self.ef_construction) - return with_params - - -class DistanceBase(Func): - output_field = FloatField() - - def __init__(self, expression, vector, **extra): - if not hasattr(vector, 'resolve_expression'): - vector = Value(to_db(vector)) - super().__init__(expression, vector, **extra) - - -class L2Distance(DistanceBase): - function = '' - arg_joiner = ' <-> ' - - -class MaxInnerProduct(DistanceBase): - function = '' - arg_joiner = ' <#> ' - - -class CosineDistance(DistanceBase): - function = '' - arg_joiner = ' <=> ' +from .bit import BitField +from .extensions import VectorExtension +from .functions import L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance +from .halfvec import HalfVectorField +from .indexes import IvfflatIndex, HnswIndex +from .sparsevec import SparseVectorField +from .vector import VectorField + +# TODO remove +from .. 
import HalfVector, SparseVector + +__all__ = [ + 'VectorExtension', + 'VectorField', + 'HalfVectorField', + 'BitField', + 'SparseVectorField', + 'IvfflatIndex', + 'HnswIndex', + 'L2Distance', + 'MaxInnerProduct', + 'CosineDistance', + 'L1Distance', + 'HammingDistance', + 'JaccardDistance', + 'HalfVector', + 'SparseVector' +] diff --git a/pgvector/django/bit.py b/pgvector/django/bit.py new file mode 100644 index 0000000..2cc847a --- /dev/null +++ b/pgvector/django/bit.py @@ -0,0 +1,32 @@ +from django import forms +from django.db.models import Field + + +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ +class BitField(Field): + description = 'Bit string' + + def __init__(self, *args, length=None, **kwargs): + self.length = length + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.length is not None: + kwargs['length'] = self.length + return name, path, args, kwargs + + def db_type(self, connection): + if self.length is None: + return 'bit' + return 'bit(%d)' % self.length + + def formfield(self, **kwargs): + return super().formfield(form_class=BitFormField, **kwargs) + + +class BitFormField(forms.CharField): + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/pgvector/django/extensions.py b/pgvector/django/extensions.py new file mode 100644 index 0000000..0573f72 --- /dev/null +++ b/pgvector/django/extensions.py @@ -0,0 +1,6 @@ +from django.contrib.postgres.operations import CreateExtension + + +class VectorExtension(CreateExtension): + def __init__(self): + self.name = 'vector' diff --git a/pgvector/django/forms.py b/pgvector/django/forms.py deleted file mode 100644 index 3748236..0000000 --- a/pgvector/django/forms.py +++ /dev/null @@ -1,12 +0,0 @@ -from django import forms -import numpy as np -from .widgets import VectorWidget - - -class VectorFormField(forms.CharField): - widget = VectorWidget - - def has_changed(self, initial, data): - if isinstance(initial, np.ndarray): - initial = initial.tolist() - return super().has_changed(initial, data) diff --git a/pgvector/django/functions.py b/pgvector/django/functions.py new file mode 100644 index 0000000..9df4fdb --- /dev/null +++ b/pgvector/django/functions.py @@ -0,0 +1,59 @@ +from django.db.models import FloatField, Func, Value +from .. 
import Vector, HalfVector, SparseVector + + +class DistanceBase(Func): + output_field = FloatField() + + def __init__(self, expression, vector, **extra): + if not hasattr(vector, 'resolve_expression'): + if isinstance(vector, HalfVector): + vector = Value(HalfVector._to_db(vector)) + elif isinstance(vector, SparseVector): + vector = Value(SparseVector._to_db(vector)) + else: + vector = Value(Vector._to_db(vector)) + + # prevent error with unhashable types + self._constructor_args = ((expression, vector), extra) + + super().__init__(expression, vector, **extra) + + +class BitDistanceBase(Func): + output_field = FloatField() + + def __init__(self, expression, vector, **extra): + if not hasattr(vector, 'resolve_expression'): + vector = Value(vector) + super().__init__(expression, vector, **extra) + + +class L2Distance(DistanceBase): + function = '' + arg_joiner = ' <-> ' + + +class MaxInnerProduct(DistanceBase): + function = '' + arg_joiner = ' <#> ' + + +class CosineDistance(DistanceBase): + function = '' + arg_joiner = ' <=> ' + + +class L1Distance(DistanceBase): + function = '' + arg_joiner = ' <+> ' + + +class HammingDistance(BitDistanceBase): + function = '' + arg_joiner = ' <~> ' + + +class JaccardDistance(BitDistanceBase): + function = '' + arg_joiner = ' <%%> ' diff --git a/pgvector/django/halfvec.py b/pgvector/django/halfvec.py new file mode 100644 index 0000000..3aeb90f --- /dev/null +++ b/pgvector/django/halfvec.py @@ -0,0 +1,60 @@ +from django import forms +from django.db.models import Field +from .. import HalfVector + + +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ +class HalfVectorField(Field): + description = 'Half vector' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'halfvec' + return 'halfvec(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + return HalfVector._from_db(value) + + def to_python(self, value): + if value is None or isinstance(value, HalfVector): + return value + elif isinstance(value, str): + return HalfVector._from_db(value) + else: + return HalfVector(value) + + def get_prep_value(self, value): + return HalfVector._to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) + + def formfield(self, **kwargs): + return super().formfield(form_class=HalfVectorFormField, **kwargs) + + +class HalfVectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, HalfVector): + value = value.to_list() + return super().format_value(value) + + +class HalfVectorFormField(forms.CharField): + widget = HalfVectorWidget + + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/pgvector/django/indexes.py b/pgvector/django/indexes.py new file mode 100644 index 0000000..5bec0eb --- /dev/null +++ b/pgvector/django/indexes.py @@ -0,0 +1,46 @@ +from django.contrib.postgres.indexes import PostgresIndex + + +class IvfflatIndex(PostgresIndex): + suffix = 'ivfflat' + + def __init__(self, *expressions, lists=None, **kwargs): + self.lists = lists + super().__init__(*expressions, **kwargs) + + def deconstruct(self): + 
path, args, kwargs = super().deconstruct() + if self.lists is not None: + kwargs['lists'] = self.lists + return path, args, kwargs + + def get_with_params(self): + with_params = [] + if self.lists is not None: + with_params.append('lists = %d' % self.lists) + return with_params + + +class HnswIndex(PostgresIndex): + suffix = 'hnsw' + + def __init__(self, *expressions, m=None, ef_construction=None, **kwargs): + self.m = m + self.ef_construction = ef_construction + super().__init__(*expressions, **kwargs) + + def deconstruct(self): + path, args, kwargs = super().deconstruct() + if self.m is not None: + kwargs['m'] = self.m + if self.ef_construction is not None: + kwargs['ef_construction'] = self.ef_construction + return path, args, kwargs + + def get_with_params(self): + with_params = [] + if self.m is not None: + with_params.append('m = %d' % self.m) + if self.ef_construction is not None: + with_params.append('ef_construction = %d' % self.ef_construction) + return with_params diff --git a/pgvector/django/sparsevec.py b/pgvector/django/sparsevec.py new file mode 100644 index 0000000..580f27c --- /dev/null +++ b/pgvector/django/sparsevec.py @@ -0,0 +1,55 @@ +from django import forms +from django.db.models import Field +from .. import SparseVector + + +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ +class SparseVectorField(Field): + description = 'Sparse vector' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'sparsevec' + return 'sparsevec(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + return SparseVector._from_db(value) + + def to_python(self, value): + return SparseVector._from_db(value) + + def get_prep_value(self, value): + return SparseVector._to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) + + def formfield(self, **kwargs): + return super().formfield(form_class=SparseVectorFormField, **kwargs) + + +class SparseVectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, SparseVector): + value = value.to_text() + return super().format_value(value) + + +class SparseVectorFormField(forms.CharField): + widget = SparseVectorWidget + + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/pgvector/django/vector.py b/pgvector/django/vector.py new file mode 100644 index 0000000..861cfde --- /dev/null +++ b/pgvector/django/vector.py @@ -0,0 +1,73 @@ +from django import forms +from django.db.models import Field +import numpy as np +from .. 
import Vector + + +# https://docs.djangoproject.com/en/5.0/howto/custom-model-fields/ +class VectorField(Field): + description = 'Vector' + empty_strings_allowed = False + + def __init__(self, *args, dimensions=None, **kwargs): + self.dimensions = dimensions + super().__init__(*args, **kwargs) + + def deconstruct(self): + name, path, args, kwargs = super().deconstruct() + if self.dimensions is not None: + kwargs['dimensions'] = self.dimensions + return name, path, args, kwargs + + def db_type(self, connection): + if self.dimensions is None: + return 'vector' + return 'vector(%d)' % self.dimensions + + def from_db_value(self, value, expression, connection): + return Vector._from_db(value) + + def to_python(self, value): + if isinstance(value, list): + return np.array(value, dtype=np.float32) + return Vector._from_db(value) + + def get_prep_value(self, value): + return Vector._to_db(value) + + def value_to_string(self, obj): + return self.get_prep_value(self.value_from_object(obj)) + + def validate(self, value, model_instance): + if isinstance(value, np.ndarray): + value = value.tolist() + super().validate(value, model_instance) + + def run_validators(self, value): + if isinstance(value, np.ndarray): + value = value.tolist() + super().run_validators(value) + + def formfield(self, **kwargs): + return super().formfield(form_class=VectorFormField, **kwargs) + + +class VectorWidget(forms.TextInput): + def format_value(self, value): + if isinstance(value, np.ndarray): + value = value.tolist() + return super().format_value(value) + + +class VectorFormField(forms.CharField): + widget = VectorWidget + + def has_changed(self, initial, data): + if isinstance(initial, np.ndarray): + initial = initial.tolist() + return super().has_changed(initial, data) + + def to_python(self, value): + if isinstance(value, str) and value == '': + return None + return super().to_python(value) diff --git a/pgvector/django/widgets.py b/pgvector/django/widgets.py deleted file mode 100644 index 731d632..0000000 --- a/pgvector/django/widgets.py +++ /dev/null @@ -1,9 +0,0 @@ -from django import forms -import numpy as np - - -class VectorWidget(forms.TextInput): - def format_value(self, value): - if isinstance(value, np.ndarray): - value = value.tolist() - return super().format_value(value) diff --git a/pgvector/halfvec.py b/pgvector/halfvec.py new file mode 100644 index 0000000..f335f2f --- /dev/null +++ b/pgvector/halfvec.py @@ -0,0 +1,83 @@ +import numpy as np +from struct import pack, unpack_from + + +class HalfVector: + def __init__(self, value): + # asarray still copies if same dtype + if not isinstance(value, np.ndarray) or value.dtype != '>f2': + value = np.asarray(value, dtype='>f2') + + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + self._value = value + + def __repr__(self): + return f'HalfVector({self.to_list()})' + + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + + def dimensions(self): + return len(self._value) + + def to_list(self): + return self._value.tolist() + + def to_numpy(self): + return self._value + + def to_text(self): + return '[' + ','.join([str(float(v)) for v in self._value]) + ']' + + def to_binary(self): + return pack('>HH', self.dimensions(), 0) + self._value.tobytes() + + @classmethod + def from_text(cls, value): + return cls([float(v) for v in value[1:-1].split(',')]) + + @classmethod + def from_binary(cls, value): + dim, unused = unpack_from('>HH', value) + return 
cls(np.frombuffer(value, dtype='>f2', count=dim, offset=4)) + + @classmethod + def _to_db(cls, value, dim=None): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) + + return value.to_text() + + @classmethod + def _to_db_binary(cls, value): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + return value.to_binary() + + @classmethod + def _from_db(cls, value): + if value is None or isinstance(value, cls): + return value + + return cls.from_text(value) + + @classmethod + def _from_db_binary(cls, value): + if value is None or isinstance(value, cls): + return value + + return cls.from_binary(value) diff --git a/pgvector/peewee/__init__.py b/pgvector/peewee/__init__.py index 148f8b3..df21200 100644 --- a/pgvector/peewee/__init__.py +++ b/pgvector/peewee/__init__.py @@ -1,31 +1,16 @@ -from peewee import Expression, Field, Value -from ..utils import from_db, to_db - - -class VectorField(Field): - field_type = 'vector' - - def __init__(self, dimensions=None, *args, **kwargs): - self.dimensions = dimensions - super(VectorField, self).__init__(*args, **kwargs) - - def get_modifiers(self): - return self.dimensions and [self.dimensions] or None - - def db_value(self, value): - return to_db(value) - - def python_value(self, value): - return from_db(value) - - def _distance(self, op, vector): - return Expression(lhs=self, op=op, rhs=self.to_value(vector)) - - def l2_distance(self, vector): - return self._distance('<->', vector) - - def max_inner_product(self, vector): - return self._distance('<#>', vector) - - def cosine_distance(self, vector): - return self._distance('<=>', vector) +from .bit import FixedBitField +from .halfvec import HalfVectorField +from .sparsevec import SparseVectorField +from .vector import VectorField + +# TODO remove +from .. import HalfVector, SparseVector + +__all__ = [ + 'VectorField', + 'HalfVectorField', + 'FixedBitField', + 'SparseVectorField', + 'HalfVector', + 'SparseVector' +] diff --git a/pgvector/peewee/bit.py b/pgvector/peewee/bit.py new file mode 100644 index 0000000..ee5f12f --- /dev/null +++ b/pgvector/peewee/bit.py @@ -0,0 +1,21 @@ +from peewee import Expression, Field + + +class FixedBitField(Field): + field_type = 'bit' + + def __init__(self, max_length=None, *args, **kwargs): + self.max_length = max_length + super(FixedBitField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.max_length and [self.max_length] or None + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def hamming_distance(self, vector): + return self._distance('<~>', vector) + + def jaccard_distance(self, vector): + return self._distance('<%%>', vector) diff --git a/pgvector/peewee/halfvec.py b/pgvector/peewee/halfvec.py new file mode 100644 index 0000000..0901fd2 --- /dev/null +++ b/pgvector/peewee/halfvec.py @@ -0,0 +1,34 @@ +from peewee import Expression, Field +from .. 
import HalfVector + + +class HalfVectorField(Field): + field_type = 'halfvec' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(HalfVectorField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + return HalfVector._to_db(value) + + def python_value(self, value): + return HalfVector._from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/pgvector/peewee/sparsevec.py b/pgvector/peewee/sparsevec.py new file mode 100644 index 0000000..86dea73 --- /dev/null +++ b/pgvector/peewee/sparsevec.py @@ -0,0 +1,34 @@ +from peewee import Expression, Field +from .. import SparseVector + + +class SparseVectorField(Field): + field_type = 'sparsevec' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(SparseVectorField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + return SparseVector._to_db(value) + + def python_value(self, value): + return SparseVector._from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/pgvector/peewee/vector.py b/pgvector/peewee/vector.py new file mode 100644 index 0000000..83f9997 --- /dev/null +++ b/pgvector/peewee/vector.py @@ -0,0 +1,34 @@ +from peewee import Expression, Field +from .. import Vector + + +class VectorField(Field): + field_type = 'vector' + + def __init__(self, dimensions=None, *args, **kwargs): + self.dimensions = dimensions + super(VectorField, self).__init__(*args, **kwargs) + + def get_modifiers(self): + return self.dimensions and [self.dimensions] or None + + def db_value(self, value): + return Vector._to_db(value) + + def python_value(self, value): + return Vector._from_db(value) + + def _distance(self, op, vector): + return Expression(lhs=self, op=op, rhs=self.to_value(vector)) + + def l2_distance(self, vector): + return self._distance('<->', vector) + + def max_inner_product(self, vector): + return self._distance('<#>', vector) + + def cosine_distance(self, vector): + return self._distance('<=>', vector) + + def l1_distance(self, vector): + return self._distance('<+>', vector) diff --git a/pgvector/pg8000/__init__.py b/pgvector/pg8000/__init__.py new file mode 100644 index 0000000..b3b4440 --- /dev/null +++ b/pgvector/pg8000/__init__.py @@ -0,0 +1,5 @@ +from .register import register_vector + +__all__ = [ + 'register_vector' +] diff --git a/pgvector/pg8000/register.py b/pgvector/pg8000/register.py new file mode 100644 index 0000000..15ee219 --- /dev/null +++ b/pgvector/pg8000/register.py @@ -0,0 +1,23 @@ +import numpy as np +from .. 
import Vector, HalfVector, SparseVector + + +def register_vector(conn): + # use to_regtype to get first matching type in search path + res = conn.run("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('halfvec'), to_regtype('sparsevec'))") + type_info = dict(res) + + if 'vector' not in type_info: + raise RuntimeError('vector type not found in the database') + + conn.register_out_adapter(Vector, Vector._to_db) + conn.register_out_adapter(np.ndarray, Vector._to_db) + conn.register_in_adapter(type_info['vector'], Vector._from_db) + + if 'halfvec' in type_info: + conn.register_out_adapter(HalfVector, HalfVector._to_db) + conn.register_in_adapter(type_info['halfvec'], HalfVector._from_db) + + if 'sparsevec' in type_info: + conn.register_out_adapter(SparseVector, SparseVector._to_db) + conn.register_in_adapter(type_info['sparsevec'], SparseVector._from_db) diff --git a/pgvector/psycopg/__init__.py b/pgvector/psycopg/__init__.py index bd398ec..980af84 100644 --- a/pgvector/psycopg/__init__.py +++ b/pgvector/psycopg/__init__.py @@ -1,69 +1,13 @@ -import psycopg -from psycopg.adapt import Loader, Dumper -from psycopg.pq import Format -from psycopg.types import TypeInfo -from ..utils import from_db, from_db_binary, to_db, to_db_binary - -__all__ = ['register_vector'] - - -class VectorDumper(Dumper): - - format = Format.TEXT - - def dump(self, obj): - return to_db(obj).encode("utf8") - - -class VectorBinaryDumper(VectorDumper): - - format = Format.BINARY - - def dump(self, obj): - return to_db_binary(obj) - - -class VectorLoader(Loader): - - format = Format.TEXT - - def load(self, data): - if isinstance(data, memoryview): - data = bytes(data) - return from_db(data.decode("utf8")) - - -class VectorBinaryLoader(VectorLoader): - - format = Format.BINARY - - def load(self, data): - if isinstance(data, memoryview): - data = bytes(data) - return from_db_binary(data) - - -def register_vector(context): - info = TypeInfo.fetch(context, 'vector') - register_vector_info(context, info) - - -async def register_vector_async(context): - info = await TypeInfo.fetch(context, 'vector') - register_vector_info(context, info) - - -def register_vector_info(context, info): - if info is None: - raise psycopg.ProgrammingError('vector type not found in the database') - info.register(context) - - # add oid to anonymous class for set_types - text_dumper = type('', (VectorDumper,), {'oid': info.oid}) - binary_dumper = type('', (VectorBinaryDumper,), {'oid': info.oid}) - - adapters = context.adapters - adapters.register_dumper('numpy.ndarray', text_dumper) - adapters.register_dumper('numpy.ndarray', binary_dumper) - adapters.register_loader(info.oid, VectorLoader) - adapters.register_loader(info.oid, VectorBinaryLoader) +from .register import register_vector, register_vector_async + +# TODO remove +from .. import Bit, HalfVector, SparseVector, Vector + +__all__ = [ + 'register_vector', + 'register_vector_async', + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] diff --git a/pgvector/psycopg/bit.py b/pgvector/psycopg/bit.py new file mode 100644 index 0000000..cffe8fb --- /dev/null +++ b/pgvector/psycopg/bit.py @@ -0,0 +1,31 @@ +from psycopg.adapt import Dumper +from psycopg.pq import Format +from .. 
import Bit + + +class BitDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return Bit._to_db(obj).encode('utf8') + + +class BitBinaryDumper(BitDumper): + + format = Format.BINARY + + def dump(self, obj): + return Bit._to_db_binary(obj) + + +def register_bit_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (BitDumper,), {'oid': info.oid}) + binary_dumper = type('', (BitBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(Bit, text_dumper) + adapters.register_dumper(Bit, binary_dumper) diff --git a/pgvector/psycopg/halfvec.py b/pgvector/psycopg/halfvec.py new file mode 100644 index 0000000..b3a0060 --- /dev/null +++ b/pgvector/psycopg/halfvec.py @@ -0,0 +1,53 @@ +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from .. import HalfVector + + +class HalfVectorDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return HalfVector._to_db(obj).encode('utf8') + + +class HalfVectorBinaryDumper(HalfVectorDumper): + + format = Format.BINARY + + def dump(self, obj): + return HalfVector._to_db_binary(obj) + + +class HalfVectorLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return HalfVector._from_db(data.decode('utf8')) + + +class HalfVectorBinaryLoader(HalfVectorLoader): + + format = Format.BINARY + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return HalfVector._from_db_binary(data) + + +def register_halfvec_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (HalfVectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (HalfVectorBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(HalfVector, text_dumper) + adapters.register_dumper(HalfVector, binary_dumper) + adapters.register_loader(info.oid, HalfVectorLoader) + adapters.register_loader(info.oid, HalfVectorBinaryLoader) diff --git a/pgvector/psycopg/register.py b/pgvector/psycopg/register.py new file mode 100644 index 0000000..b93fd3e --- /dev/null +++ b/pgvector/psycopg/register.py @@ -0,0 +1,37 @@ +from psycopg.types import TypeInfo +from .bit import register_bit_info +from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info +from .vector import register_vector_info + + +def register_vector(context): + info = TypeInfo.fetch(context, 'vector') + register_vector_info(context, info) + + info = TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + + info = TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) + + +async def register_vector_async(context): + info = await TypeInfo.fetch(context, 'vector') + register_vector_info(context, info) + + info = await TypeInfo.fetch(context, 'bit') + register_bit_info(context, info) + + info = await TypeInfo.fetch(context, 'halfvec') + if info is not None: + register_halfvec_info(context, info) + + info = await TypeInfo.fetch(context, 'sparsevec') + if info is not None: + register_sparsevec_info(context, info) diff --git a/pgvector/psycopg/sparsevec.py b/pgvector/psycopg/sparsevec.py new file mode 100644 index 0000000..384a0e1 --- /dev/null +++ b/pgvector/psycopg/sparsevec.py @@ -0,0 +1,53 @@ +from psycopg.adapt import 
Loader, Dumper +from psycopg.pq import Format +from .. import SparseVector + + +class SparseVectorDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return SparseVector._to_db(obj).encode('utf8') + + +class SparseVectorBinaryDumper(SparseVectorDumper): + + format = Format.BINARY + + def dump(self, obj): + return SparseVector._to_db_binary(obj) + + +class SparseVectorLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return SparseVector._from_db(data.decode('utf8')) + + +class SparseVectorBinaryLoader(SparseVectorLoader): + + format = Format.BINARY + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return SparseVector._from_db_binary(data) + + +def register_sparsevec_info(context, info): + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (SparseVectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (SparseVectorBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper(SparseVector, text_dumper) + adapters.register_dumper(SparseVector, binary_dumper) + adapters.register_loader(info.oid, SparseVectorLoader) + adapters.register_loader(info.oid, SparseVectorBinaryLoader) diff --git a/pgvector/psycopg/vector.py b/pgvector/psycopg/vector.py new file mode 100644 index 0000000..db9e826 --- /dev/null +++ b/pgvector/psycopg/vector.py @@ -0,0 +1,58 @@ +import psycopg +from psycopg.adapt import Loader, Dumper +from psycopg.pq import Format +from .. import Vector + + +class VectorDumper(Dumper): + + format = Format.TEXT + + def dump(self, obj): + return Vector._to_db(obj).encode('utf8') + + +class VectorBinaryDumper(VectorDumper): + + format = Format.BINARY + + def dump(self, obj): + return Vector._to_db_binary(obj) + + +class VectorLoader(Loader): + + format = Format.TEXT + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return Vector._from_db(data.decode('utf8')) + + +class VectorBinaryLoader(VectorLoader): + + format = Format.BINARY + + def load(self, data): + if isinstance(data, memoryview): + data = bytes(data) + return Vector._from_db_binary(data) + + +def register_vector_info(context, info): + if info is None: + raise psycopg.ProgrammingError('vector type not found in the database') + info.register(context) + + # add oid to anonymous class for set_types + text_dumper = type('', (VectorDumper,), {'oid': info.oid}) + binary_dumper = type('', (VectorBinaryDumper,), {'oid': info.oid}) + + adapters = context.adapters + adapters.register_dumper('numpy.ndarray', text_dumper) + adapters.register_dumper('numpy.ndarray', binary_dumper) + adapters.register_dumper(Vector, text_dumper) + adapters.register_dumper(Vector, binary_dumper) + adapters.register_loader(info.oid, VectorLoader) + adapters.register_loader(info.oid, VectorBinaryLoader) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index 94cc081..33e5124 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,32 +1,10 @@ -import numpy as np -import psycopg2 -from psycopg2.extensions import adapt, new_type, register_adapter, register_type -from ..utils import from_db, to_db +from .register import register_vector -__all__ = ['register_vector'] +# TODO remove +from .. 
import HalfVector, SparseVector - -class VectorAdapter(object): - def __init__(self, vector): - self._vector = vector - - def getquoted(self): - return adapt(to_db(self._vector)).getquoted() - - -def cast_vector(value, cur): - return from_db(value) - - -def register_vector(conn_or_curs=None): - cur = conn_or_curs.cursor() if hasattr(conn_or_curs, 'cursor') else conn_or_curs - - try: - cur.execute('SELECT NULL::vector') - oid = cur.description[0][1] - except psycopg2.errors.UndefinedObject: - raise psycopg2.ProgrammingError('vector type not found in the database') - - vector = new_type((oid,), 'VECTOR', cast_vector) - register_type(vector) - register_adapter(np.ndarray, VectorAdapter) +__all__ = [ + 'register_vector', + 'HalfVector', + 'SparseVector' +] diff --git a/pgvector/psycopg2/halfvec.py b/pgvector/psycopg2/halfvec.py new file mode 100644 index 0000000..0a4c736 --- /dev/null +++ b/pgvector/psycopg2/halfvec.py @@ -0,0 +1,25 @@ +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type +from .. import HalfVector + + +class HalfvecAdapter: + def __init__(self, value): + self._value = value + + def getquoted(self): + return adapt(HalfVector._to_db(self._value)).getquoted() + + +def cast_halfvec(value, cur): + return HalfVector._from_db(value) + + +def register_halfvec_info(oid, array_oid, scope): + halfvec = new_type((oid,), 'HALFVEC', cast_halfvec) + register_type(halfvec, scope) + + if array_oid is not None: + halfvecarray = new_array_type((array_oid,), 'HALFVECARRAY', halfvec) + register_type(halfvecarray, scope) + + register_adapter(HalfVector, HalfvecAdapter) diff --git a/pgvector/psycopg2/register.py b/pgvector/psycopg2/register.py new file mode 100644 index 0000000..1bc9d44 --- /dev/null +++ b/pgvector/psycopg2/register.py @@ -0,0 +1,27 @@ +import psycopg2 +from psycopg2.extensions import cursor +from .halfvec import register_halfvec_info +from .sparsevec import register_sparsevec_info +from .vector import register_vector_info + + +# note: register_adapter is always global +def register_vector(conn_or_curs, globally=False, arrays=True): + conn = conn_or_curs if hasattr(conn_or_curs, 'cursor') else conn_or_curs.connection + cur = conn.cursor(cursor_factory=cursor) + scope = None if globally else conn_or_curs + + # use to_regtype to get first matching type in search path + cur.execute("SELECT typname, oid FROM pg_type WHERE oid IN (to_regtype('vector'), to_regtype('_vector'), to_regtype('halfvec'), to_regtype('_halfvec'), to_regtype('sparsevec'), to_regtype('_sparsevec'))") + type_info = dict(cur.fetchall()) + + if 'vector' not in type_info: + raise psycopg2.ProgrammingError('vector type not found in the database') + + register_vector_info(type_info['vector'], type_info['_vector'] if arrays else None, scope) + + if 'halfvec' in type_info: + register_halfvec_info(type_info['halfvec'], type_info['_halfvec'] if arrays else None, scope) + + if 'sparsevec' in type_info: + register_sparsevec_info(type_info['sparsevec'], type_info['_sparsevec'] if arrays else None, scope) diff --git a/pgvector/psycopg2/sparsevec.py b/pgvector/psycopg2/sparsevec.py new file mode 100644 index 0000000..148eff2 --- /dev/null +++ b/pgvector/psycopg2/sparsevec.py @@ -0,0 +1,25 @@ +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type +from .. 
import SparseVector + + +class SparsevecAdapter: + def __init__(self, value): + self._value = value + + def getquoted(self): + return adapt(SparseVector._to_db(self._value)).getquoted() + + +def cast_sparsevec(value, cur): + return SparseVector._from_db(value) + + +def register_sparsevec_info(oid, array_oid, scope): + sparsevec = new_type((oid,), 'SPARSEVEC', cast_sparsevec) + register_type(sparsevec, scope) + + if array_oid is not None: + sparsevecarray = new_array_type((array_oid,), 'SPARSEVECARRAY', sparsevec) + register_type(sparsevecarray, scope) + + register_adapter(SparseVector, SparsevecAdapter) diff --git a/pgvector/psycopg2/vector.py b/pgvector/psycopg2/vector.py new file mode 100644 index 0000000..562de18 --- /dev/null +++ b/pgvector/psycopg2/vector.py @@ -0,0 +1,27 @@ +import numpy as np +from psycopg2.extensions import adapt, new_array_type, new_type, register_adapter, register_type +from .. import Vector + + +class VectorAdapter: + def __init__(self, value): + self._value = value + + def getquoted(self): + return adapt(Vector._to_db(self._value)).getquoted() + + +def cast_vector(value, cur): + return Vector._from_db(value) + + +def register_vector_info(oid, array_oid, scope): + vector = new_type((oid,), 'VECTOR', cast_vector) + register_type(vector, scope) + + if array_oid is not None: + vectorarray = new_array_type((array_oid,), 'VECTORARRAY', vector) + register_type(vectorarray, scope) + + register_adapter(np.ndarray, VectorAdapter) + register_adapter(Vector, VectorAdapter) diff --git a/pgvector/sparsevec.py b/pgvector/sparsevec.py new file mode 100644 index 0000000..895fbd0 --- /dev/null +++ b/pgvector/sparsevec.py @@ -0,0 +1,161 @@ +import numpy as np +from struct import pack, unpack_from + +NO_DEFAULT = object() + + +class SparseVector: + def __init__(self, value, dimensions=NO_DEFAULT, /): + if value.__class__.__module__.startswith('scipy.sparse.'): + if dimensions is not NO_DEFAULT: + raise ValueError('extra argument') + + self._from_sparse(value) + elif isinstance(value, dict): + if dimensions is NO_DEFAULT: + raise ValueError('missing dimensions') + + self._from_dict(value, dimensions) + else: + if dimensions is not NO_DEFAULT: + raise ValueError('extra argument') + + self._from_dense(value) + + def __repr__(self): + elements = dict(zip(self._indices, self._values)) + return f'SparseVector({elements}, {self._dim})' + + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.dimensions() == other.dimensions() and self.indices() == other.indices() and self.values() == other.values() + return False + + def dimensions(self): + return self._dim + + def indices(self): + return self._indices + + def values(self): + return self._values + + def to_coo(self): + from scipy.sparse import coo_array + + coords = ([0] * len(self._indices), self._indices) + return coo_array((self._values, coords), shape=(1, self._dim)) + + def to_list(self): + vec = [0.0] * self._dim + for i, v in zip(self._indices, self._values): + vec[i] = v + return vec + + def to_numpy(self): + vec = np.repeat(0.0, self._dim).astype(np.float32) + for i, v in zip(self._indices, self._values): + vec[i] = v + return vec + + def to_text(self): + return '{' + ','.join([f'{int(i) + 1}:{float(v)}' for i, v in zip(self._indices, self._values)]) + '}/' + str(int(self._dim)) + + def to_binary(self): + nnz = len(self._indices) + return pack(f'>iii{nnz}i{nnz}f', self._dim, nnz, 0, *self._indices, *self._values) + + def _from_dict(self, d, dim): + elements = [(i, v) for i, v in d.items() if v != 0] + 
elements.sort() + + self._dim = int(dim) + self._indices = [int(v[0]) for v in elements] + self._values = [float(v[1]) for v in elements] + + def _from_sparse(self, value): + value = value.tocoo() + + if value.ndim == 1: + self._dim = value.shape[0] + elif value.ndim == 2 and value.shape[0] == 1: + self._dim = value.shape[1] + else: + raise ValueError('expected ndim to be 1') + + if hasattr(value, 'coords'): + # scipy 1.13+ + self._indices = value.coords[-1].tolist() + else: + self._indices = value.col.tolist() + self._values = value.data.tolist() + + def _from_dense(self, value): + self._dim = len(value) + self._indices = [i for i, v in enumerate(value) if v != 0] + self._values = [float(value[i]) for i in self._indices] + + @classmethod + def from_text(cls, value): + elements, dim = value.split('/', 2) + indices = [] + values = [] + # split on empty string returns single element list + if len(elements) > 2: + for e in elements[1:-1].split(','): + i, v = e.split(':', 2) + indices.append(int(i) - 1) + values.append(float(v)) + return cls._from_parts(int(dim), indices, values) + + @classmethod + def from_binary(cls, value): + dim, nnz, unused = unpack_from('>iii', value) + indices = unpack_from(f'>{nnz}i', value, 12) + values = unpack_from(f'>{nnz}f', value, 12 + nnz * 4) + return cls._from_parts(int(dim), list(indices), list(values)) + + @classmethod + def _from_parts(cls, dim, indices, values): + vec = cls.__new__(cls) + vec._dim = dim + vec._indices = indices + vec._values = values + return vec + + @classmethod + def _to_db(cls, value, dim=None): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) + + return value.to_text() + + @classmethod + def _to_db_binary(cls, value): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + return value.to_binary() + + @classmethod + def _from_db(cls, value): + if value is None or isinstance(value, cls): + return value + + return cls.from_text(value) + + @classmethod + def _from_db_binary(cls, value): + if value is None or isinstance(value, cls): + return value + + return cls.from_binary(value) diff --git a/pgvector/sqlalchemy/__init__.py b/pgvector/sqlalchemy/__init__.py index 1cf5b66..52adf88 100644 --- a/pgvector/sqlalchemy/__init__.py +++ b/pgvector/sqlalchemy/__init__.py @@ -1,37 +1,21 @@ -from sqlalchemy.types import UserDefinedType, Float -from ..utils import from_db, to_db - -__all__ = ['Vector'] - - -class Vector(UserDefinedType): - cache_ok = True - - def __init__(self, dim=None): - super(UserDefinedType, self).__init__() - self.dim = dim - - def get_col_spec(self, **kw): - if self.dim is None: - return "VECTOR" - return "VECTOR(%d)" % self.dim - - def bind_processor(self, dialect): - def process(value): - return to_db(value, self.dim) - return process - - def result_processor(self, dialect, coltype): - def process(value): - return from_db(value) - return process - - class comparator_factory(UserDefinedType.Comparator): - def l2_distance(self, other): - return self.op('<->', return_type=Float)(other) - - def max_inner_product(self, other): - return self.op('<#>', return_type=Float)(other) - - def cosine_distance(self, other): - return self.op('<=>', return_type=Float)(other) +from .bit import BIT +from .functions import avg, sum +from .halfvec import HALFVEC +from .sparsevec import SPARSEVEC +from .vector import VECTOR +from 
.vector import VECTOR as Vector + +# TODO remove +from .. import HalfVector, SparseVector + +__all__ = [ + 'Vector', + 'VECTOR', + 'HALFVEC', + 'BIT', + 'SPARSEVEC', + 'HalfVector', + 'SparseVector', + 'avg', + 'sum' +] diff --git a/pgvector/sqlalchemy/bit.py b/pgvector/sqlalchemy/bit.py new file mode 100644 index 0000000..0f83f3c --- /dev/null +++ b/pgvector/sqlalchemy/bit.py @@ -0,0 +1,26 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float + + +class BIT(UserDefinedType): + cache_ok = True + + def __init__(self, length=None): + super(UserDefinedType, self).__init__() + self.length = length + + def get_col_spec(self, **kw): + if self.length is None: + return 'BIT' + return 'BIT(%d)' % self.length + + class comparator_factory(UserDefinedType.Comparator): + def hamming_distance(self, other): + return self.op('<~>', return_type=Float)(other) + + def jaccard_distance(self, other): + return self.op('<%>', return_type=Float)(other) + + +# for reflection +ischema_names['bit'] = BIT diff --git a/pgvector/sqlalchemy/functions.py b/pgvector/sqlalchemy/functions.py new file mode 100644 index 0000000..72e3ca7 --- /dev/null +++ b/pgvector/sqlalchemy/functions.py @@ -0,0 +1,14 @@ +# https://docs.sqlalchemy.org/en/20/core/functions.html +# include sum for a consistent API +from sqlalchemy.sql.functions import ReturnTypeFromArgs, sum + + +class avg(ReturnTypeFromArgs): + inherit_cache = True + package = 'pgvector' + + +__all__ = [ + 'avg', + 'sum' +] diff --git a/pgvector/sqlalchemy/halfvec.py b/pgvector/sqlalchemy/halfvec.py new file mode 100644 index 0000000..10688b5 --- /dev/null +++ b/pgvector/sqlalchemy/halfvec.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from .. import HalfVector + + +class HALFVEC(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'HALFVEC' + return 'HALFVEC(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return HalfVector._to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(HalfVector._to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return HalfVector._from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + + +# for reflection +ischema_names['halfvec'] = HALFVEC diff --git a/pgvector/sqlalchemy/sparsevec.py b/pgvector/sqlalchemy/sparsevec.py new file mode 100644 index 0000000..0058679 --- /dev/null +++ b/pgvector/sqlalchemy/sparsevec.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from .. 
import SparseVector + + +class SPARSEVEC(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'SPARSEVEC' + return 'SPARSEVEC(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return SparseVector._to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(SparseVector._to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return SparseVector._from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + + +# for reflection +ischema_names['sparsevec'] = SPARSEVEC diff --git a/pgvector/sqlalchemy/vector.py b/pgvector/sqlalchemy/vector.py new file mode 100644 index 0000000..5a1e11f --- /dev/null +++ b/pgvector/sqlalchemy/vector.py @@ -0,0 +1,51 @@ +from sqlalchemy.dialects.postgresql.base import ischema_names +from sqlalchemy.types import UserDefinedType, Float, String +from .. import Vector + + +class VECTOR(UserDefinedType): + cache_ok = True + _string = String() + + def __init__(self, dim=None): + super(UserDefinedType, self).__init__() + self.dim = dim + + def get_col_spec(self, **kw): + if self.dim is None: + return 'VECTOR' + return 'VECTOR(%d)' % self.dim + + def bind_processor(self, dialect): + def process(value): + return Vector._to_db(value, self.dim) + return process + + def literal_processor(self, dialect): + string_literal_processor = self._string._cached_literal_processor(dialect) + + def process(value): + return string_literal_processor(Vector._to_db(value, self.dim)) + return process + + def result_processor(self, dialect, coltype): + def process(value): + return Vector._from_db(value) + return process + + class comparator_factory(UserDefinedType.Comparator): + def l2_distance(self, other): + return self.op('<->', return_type=Float)(other) + + def max_inner_product(self, other): + return self.op('<#>', return_type=Float)(other) + + def cosine_distance(self, other): + return self.op('<=>', return_type=Float)(other) + + def l1_distance(self, other): + return self.op('<+>', return_type=Float)(other) + + +# for reflection +ischema_names['vector'] = VECTOR diff --git a/pgvector/utils/__init__.py b/pgvector/utils/__init__.py index 5640b60..8cdb5d6 100644 --- a/pgvector/utils/__init__.py +++ b/pgvector/utils/__init__.py @@ -1,49 +1,9 @@ -import numpy as np -from struct import pack, unpack - - -def from_db(value): - # could be ndarray if already cast by lower-level driver - if value is None or isinstance(value, np.ndarray): - return value - - return np.array(value[1:-1].split(','), dtype=np.float32) - - -def from_db_binary(value): - if value is None: - return value - - (dim, unused) = unpack('>HH', value[:4]) - return np.frombuffer(value, dtype='>f', count=dim, offset=4).astype(dtype=np.float32) - - -def to_db(value, dim=None): - if value is None: - return value - - if isinstance(value, np.ndarray): - if value.ndim 
!= 1: - raise ValueError('expected ndim to be 1') - - if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating): - raise ValueError('dtype must be numeric') - - value = value.tolist() - - if dim is not None and len(value) != dim: - raise ValueError('expected %d dimensions, not %d' % (dim, len(value))) - - return '[' + ','.join([str(float(v)) for v in value]) + ']' - - -def to_db_binary(value): - if value is None: - return value - - value = np.asarray(value, dtype='>f') - - if value.ndim != 1: - raise ValueError('expected ndim to be 1') - - return pack('>HH', value.shape[0], 0) + value.tobytes() +# TODO remove +from .. import Bit, HalfVector, SparseVector, Vector + +__all__ = [ + 'Vector', + 'HalfVector', + 'Bit', + 'SparseVector' +] diff --git a/pgvector/vector.py b/pgvector/vector.py new file mode 100644 index 0000000..ebbcafd --- /dev/null +++ b/pgvector/vector.py @@ -0,0 +1,83 @@ +import numpy as np +from struct import pack, unpack_from + + +class Vector: + def __init__(self, value): + # asarray still copies if same dtype + if not isinstance(value, np.ndarray) or value.dtype != '>f4': + value = np.asarray(value, dtype='>f4') + + if value.ndim != 1: + raise ValueError('expected ndim to be 1') + + self._value = value + + def __repr__(self): + return f'Vector({self.to_list()})' + + def __eq__(self, other): + if isinstance(other, self.__class__): + return np.array_equal(self.to_numpy(), other.to_numpy()) + return False + + def dimensions(self): + return len(self._value) + + def to_list(self): + return self._value.tolist() + + def to_numpy(self): + return self._value + + def to_text(self): + return '[' + ','.join([str(float(v)) for v in self._value]) + ']' + + def to_binary(self): + return pack('>HH', self.dimensions(), 0) + self._value.tobytes() + + @classmethod + def from_text(cls, value): + return cls([float(v) for v in value[1:-1].split(',')]) + + @classmethod + def from_binary(cls, value): + dim, unused = unpack_from('>HH', value) + return cls(np.frombuffer(value, dtype='>f4', count=dim, offset=4)) + + @classmethod + def _to_db(cls, value, dim=None): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + if dim is not None and value.dimensions() != dim: + raise ValueError('expected %d dimensions, not %d' % (dim, value.dimensions())) + + return value.to_text() + + @classmethod + def _to_db_binary(cls, value): + if value is None: + return value + + if not isinstance(value, cls): + value = cls(value) + + return value.to_binary() + + @classmethod + def _from_db(cls, value): + if value is None or isinstance(value, np.ndarray): + return value + + return cls.from_text(value).to_numpy().astype(np.float32) + + @classmethod + def _from_db_binary(cls, value): + if value is None or isinstance(value, np.ndarray): + return value + + return cls.from_binary(value).to_numpy().astype(np.float32) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0cfa183 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "pgvector" +version = "0.4.1" +description = "pgvector support for Python" +readme = "README.md" +authors = [ + {name = "Andrew Kane", email = "andrew@ankane.org"} +] +license = "MIT" +requires-python = ">= 3.9" +dependencies = [ + "numpy" +] + +[project.urls] +Homepage = "https://github.com/pgvector/pgvector-python" + +[tool.pytest.ini_options] +asyncio_mode = "auto" 
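+# assumes pytest-asyncio: "auto" mode collects async def tests without
+# per-test @pytest.mark.asyncio markers; the explicit loop scope below
+# silences the unset-option warning in newer pytest-asyncio releases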
+asyncio_default_fixture_loop_scope = "function" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 2f4c80e..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -asyncio_mode = auto diff --git a/requirements.txt b/requirements.txt index e111e7d..a13be06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,11 @@ asyncpg Django numpy peewee -psycopg[binary] +pg8000 +psycopg[binary,pool] psycopg2-binary pytest pytest-asyncio -SQLAlchemy +scipy +SQLAlchemy[asyncio]>=2 +sqlmodel>=0.0.12 diff --git a/setup.py b/setup.py deleted file mode 100644 index 227b6cf..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from setuptools import setup - -with open('README.md', 'r', encoding='utf-8') as fh: - long_description = fh.read() - -setup( - name='pgvector', - version='0.2.3', - description='pgvector support for Python', - long_description=long_description, - long_description_content_type='text/markdown', - url='https://github.com/pgvector/pgvector-python', - author='Andrew Kane', - author_email='andrew@ankane.org', - license='MIT', - packages=[ - 'pgvector.asyncpg', - 'pgvector.django', - 'pgvector.peewee', - 'pgvector.psycopg', - 'pgvector.psycopg2', - 'pgvector.sqlalchemy', - 'pgvector.utils' - ], - python_requires='>=3.8', - install_requires=[ - 'numpy' - ], - zip_safe=False -) diff --git a/tests/test_asyncpg.py b/tests/test_asyncpg.py index 2634310..34d66a1 100644 --- a/tests/test_asyncpg.py +++ b/tests/test_asyncpg.py @@ -1,36 +1,128 @@ -import asyncio import asyncpg import numpy as np +from pgvector import HalfVector, SparseVector, Vector from pgvector.asyncpg import register_vector import pytest class TestAsyncpg: @pytest.mark.asyncio - async def test_works(self): + async def test_vector(self): conn = await asyncpg.connect(database='pgvector_python_test') await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector(conn) - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO item (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 3]) + embedding2 = np.array([4.5, 5, 6]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) - res = await conn.fetch("SELECT * FROM item ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 - assert np.array_equal(res[0]['embedding'], embedding) + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '[1.5,2,3]' + + await conn.close() + + @pytest.mark.asyncio + async def test_halfvec(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding halfvec(3))') + + await 
register_vector(conn) + + embedding = HalfVector([1.5, 2, 3]) + embedding2 = [4.5, 5, 6] + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) + + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert res[0]['embedding'] == embedding + assert res[1]['embedding'] == HalfVector(embedding2) + assert res[2]['embedding'] is None # ensures binary format is correct - text_res = await conn.fetch("SELECT embedding::text FROM item ORDER BY id LIMIT 1") + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") assert text_res[0]['embedding'] == '[1.5,2,3]' await conn.close() + @pytest.mark.asyncio + async def test_bit(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding bit(3))') + + await register_vector(conn) + + embedding = asyncpg.BitString('101') + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert res[0]['embedding'].as_string() == '101' + assert res[0]['embedding'].to_int() == 5 + assert res[1]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '101' + + await conn.close() + + @pytest.mark.asyncio + async def test_sparsevec(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding sparsevec(3))') + + await register_vector(conn) + + embedding = SparseVector([1.5, 2, 3]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), (NULL)", embedding) + + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert res[0]['embedding'] == embedding + assert res[1]['embedding'] is None + + # ensures binary format is correct + text_res = await conn.fetch("SELECT embedding::text FROM asyncpg_items ORDER BY id LIMIT 1") + assert text_res[0]['embedding'] == '{1:1.5,2:2,3:3}/3' + + await conn.close() + + @pytest.mark.asyncio + async def test_vector_array(self): + conn = await asyncpg.connect(database='pgvector_python_test') + await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embeddings vector[])') + + await register_vector(conn) + + embeddings = [Vector([1.5, 2, 3]), Vector([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES ($1)", embeddings) + + embeddings2 = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + await conn.execute("INSERT INTO asyncpg_items (embeddings) VALUES (ARRAY[$1, $2]::vector[])", embeddings2[0], embeddings2[1]) + + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert np.array_equal(res[0]['embeddings'][0], embeddings[0].to_numpy()) + assert np.array_equal(res[0]['embeddings'][1], embeddings[1].to_numpy()) + assert np.array_equal(res[1]['embeddings'][0], embeddings2[0]) + assert np.array_equal(res[1]['embeddings'][1], embeddings2[1]) + + await conn.close() + 
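The asyncpg tests above all follow the same pattern: register the pgvector codecs once per connection, then pass typed values as ordinary query parameters. In application code that reduces to something like the following sketch (database and table names are illustrative assumptions, and the table is assumed to already have a `vector(3)` column):

```python
import asyncio

import asyncpg
from pgvector import Vector
from pgvector.asyncpg import register_vector


async def main():
    # database name is an assumption for illustration
    conn = await asyncpg.connect(database='example_db')
    await conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
    await register_vector(conn)  # adds codecs for vector, halfvec, and sparsevec

    # pgvector's <-> operator orders rows by L2 distance to the query vector
    rows = await conn.fetch(
        'SELECT id FROM items ORDER BY embedding <-> $1 LIMIT 5',
        Vector([1.5, 2, 3]),
    )
    await conn.close()


asyncio.run(main())
```

The `test_pool` case below covers the other common setup, where registration happens in the pool's `init` hook so every pooled connection gets the codecs.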
@pytest.mark.asyncio async def test_pool(self): async def init(conn): @@ -40,15 +132,15 @@ async def init(conn): async with pool.acquire() as conn: await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS asyncpg_items') + await conn.execute('CREATE TABLE asyncpg_items (id bigserial PRIMARY KEY, embedding vector(3))') - embedding = np.array([1.5, 2, 3]) - await conn.execute("INSERT INTO item (embedding) VALUES ($1), (NULL)", embedding) + embedding = Vector([1.5, 2, 3]) + embedding2 = np.array([1.5, 2, 3]) + await conn.execute("INSERT INTO asyncpg_items (embedding) VALUES ($1), ($2), (NULL)", embedding, embedding2) - res = await conn.fetch("SELECT * FROM item ORDER BY id") - assert res[0]['id'] == 1 - assert res[1]['id'] == 2 - assert np.array_equal(res[0]['embedding'], embedding) + res = await conn.fetch("SELECT * FROM asyncpg_items ORDER BY id") + assert np.array_equal(res[0]['embedding'], embedding.to_numpy()) assert res[0]['embedding'].dtype == np.float32 - assert res[1]['embedding'] is None + assert np.array_equal(res[1]['embedding'], embedding2) + assert res[2]['embedding'] is None diff --git a/tests/test_bit.py b/tests/test_bit.py new file mode 100644 index 0000000..5a71642 --- /dev/null +++ b/tests/test_bit.py @@ -0,0 +1,63 @@ +import numpy as np +from pgvector import Bit +import pytest + + +class TestBit: + def test_list(self): + assert Bit([True, False, True]).to_list() == [True, False, True] + + def test_list_none(self): + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([True, None, True]).to_text() == '101' + + def test_list_int(self): + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([254, 7, 0]).to_text() == '110' + + def test_tuple(self): + assert Bit((True, False, True)).to_list() == [True, False, True] + + def test_str(self): + assert Bit('101').to_list() == [True, False, True] + + def test_bytes(self): + assert Bit(b'\xff\x00\xf0').to_text() == '111111110000000011110000' + assert Bit(b'\xfe\x07\x00').to_text() == '111111100000011100000000' + + def test_ndarray(self): + arr = np.array([True, False, True]) + assert Bit(arr).to_list() == [True, False, True] + assert np.array_equal(Bit(arr).to_numpy(), arr) + + def test_ndarray_unpackbits(self): + arr = np.unpackbits(np.array([254, 7, 0], dtype=np.uint8)) + assert Bit(arr).to_text() == '111111100000011100000000' + + def test_ndarray_uint8(self): + arr = np.array([254, 7, 0], dtype=np.uint8) + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' + + def test_ndarray_uint16(self): + arr = np.array([254, 7, 0], dtype=np.uint16) + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' + + def test_ndim_two(self): + with pytest.raises(ValueError) as error: + Bit([[True, False], [True, False]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + Bit(True) + assert str(error.value) == 'expected ndim to be 1' + + def test_repr(self): + assert repr(Bit([True, False, True])) == 'Bit(101)' + assert str(Bit([True, False, True])) == 'Bit(101)' + + def test_equality(self): + assert Bit([True, False, True]) == Bit([True, False, True]) + assert Bit([True, False, True]) != Bit([True, False, False]) 
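The new `Bit` tests pin down which inputs the constructor accepts and how each is interpreted. A condensed view of the equivalences asserted above (values are illustrative):

```python
import numpy as np
from pgvector import Bit

# the same 3-bit value built from the accepted input types
assert Bit([True, False, True]).to_text() == '101'            # list of bools
assert Bit('101').to_list() == [True, False, True]            # string of '0'/'1'
assert Bit(np.array([True, False, True])).to_text() == '101'  # 1-d bool ndarray

# bytes are unpacked bitwise, most significant bit first, 8 bits per byte
assert Bit(b'\xa0').to_text() == '10100000'
```

Non-boolean elements (ints, `None`, non-bool dtypes) are coerced with a `UserWarning`, and anything that is not one-dimensional raises `ValueError`.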
diff --git a/tests/test_django.py b/tests/test_django.py index 5b9c386..7a8a6eb 100644 --- a/tests/test_django.py +++ b/tests/test_django.py @@ -1,14 +1,19 @@ import django from django.conf import settings +from django.contrib.postgres.fields import ArrayField +from django.contrib.postgres.indexes import OpClass from django.core import serializers from django.db import connection, migrations, models -from django.db.models import Avg, Sum +from django.db.models import Avg, Sum, FloatField, DecimalField +from django.db.models.functions import Cast from django.db.migrations.loader import MigrationLoader from django.forms import ModelForm from math import sqrt import numpy as np +import os import pgvector.django -from pgvector.django import VectorExtension, VectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance +from pgvector import HalfVector, SparseVector +from pgvector.django import VectorExtension, VectorField, HalfVectorField, BitField, SparseVectorField, IvfflatIndex, HnswIndex, L2Distance, MaxInnerProduct, CosineDistance, L1Distance, HammingDistance, JaccardDistance from unittest import mock settings.configure( @@ -17,16 +22,45 @@ 'ENGINE': 'django.db.backends.postgresql', 'NAME': 'pgvector_python_test', } - } + }, + DEBUG=('VERBOSE' in os.environ), + LOGGING={ + 'version': 1, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler' + } + }, + 'loggers': { + 'django.db.backends': { + 'handlers': ['console'], + 'level': 'DEBUG' + }, + 'django.db.backends.schema': { + 'level': 'WARNING' + } + } + }, + # needed for OpClass + # https://docs.djangoproject.com/en/5.1/ref/contrib/postgres/indexes/#opclass-expressions + INSTALLED_APPS=[ + 'django.contrib.postgres' + ] ) django.setup() class Item(models.Model): - embedding = VectorField(dimensions=3) + embedding = VectorField(dimensions=3, null=True, blank=True) + half_embedding = HalfVectorField(dimensions=3, null=True, blank=True) + binary_embedding = BitField(length=3, null=True, blank=True) + sparse_embedding = SparseVectorField(dimensions=3, null=True, blank=True) + embeddings = ArrayField(VectorField(dimensions=3), null=True, blank=True) + double_embedding = ArrayField(FloatField(), null=True, blank=True) + numeric_embedding = ArrayField(DecimalField(max_digits=20, decimal_places=10), null=True, blank=True) class Meta: - app_label = 'myapp' + app_label = 'django_app' indexes = [ IvfflatIndex( name='ivfflat_idx', @@ -38,8 +72,14 @@ class Meta: name='hnsw_idx', fields=['embedding'], m=16, - ef_construction=100, + ef_construction=64, opclasses=['vector_l2_ops'] + ), + HnswIndex( + OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), + name='hnsw_half_precision_idx', + m=16, + ef_construction=64 ) ] @@ -47,107 +87,237 @@ class Meta: class Migration(migrations.Migration): initial = True - dependencies = [ - ] - operations = [ VectorExtension(), migrations.CreateModel( name='Item', fields=[ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('embedding', pgvector.django.VectorField(dimensions=3, null=True)), + ('embedding', pgvector.django.VectorField(dimensions=3, null=True, blank=True)), + ('half_embedding', pgvector.django.HalfVectorField(dimensions=3, null=True, blank=True)), + ('binary_embedding', pgvector.django.BitField(length=3, null=True, blank=True)), + ('sparse_embedding', pgvector.django.SparseVectorField(dimensions=3, null=True, blank=True)), + ('embeddings', ArrayField(pgvector.django.VectorField(dimensions=3), 
null=True, blank=True)), + ('double_embedding', ArrayField(FloatField(), null=True, blank=True)), + ('numeric_embedding', ArrayField(DecimalField(max_digits=20, decimal_places=10), null=True, blank=True)), ], ), migrations.AddIndex( model_name='item', - index=pgvector.django.IvfflatIndex(fields=['embedding'], lists=1, name='my_index', opclasses=['vector_l2_ops']), + index=pgvector.django.IvfflatIndex(fields=['embedding'], lists=1, name='ivfflat_idx', opclasses=['vector_l2_ops']), + ), + migrations.AddIndex( + model_name='item', + index=pgvector.django.HnswIndex(fields=['embedding'], m=16, ef_construction=64, name='hnsw_idx', opclasses=['vector_l2_ops']), + ), + migrations.AddIndex( + model_name='item', + index=pgvector.django.HnswIndex(OpClass(Cast('embedding', HalfVectorField(dimensions=3)), name='halfvec_l2_ops'), m=16, ef_construction=64, name='hnsw_half_precision_idx'), ) ] # probably a better way to do this -migration = Migration('initial', 'myapp') +migration = Migration('initial', 'django_app') loader = MigrationLoader(connection, replace_migrations=False) -loader.graph.add_node(('myapp', migration.name), migration) +loader.graph.add_node(('django_app', migration.name), migration) sql_statements = loader.collect_sql([(migration, False)]) with connection.cursor() as cursor: - cursor.execute("DROP TABLE IF EXISTS myapp_item") + cursor.execute("DROP TABLE IF EXISTS django_app_item") cursor.execute('\n'.join(sql_statements)) def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] - for i, v in enumerate(vectors): - item = Item(id=i + 1, embedding=v) - item.save() + Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1])).save() + Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2])).save() + Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2])).save() -class ItemForm(ModelForm): +class VectorForm(ModelForm): class Meta: model = Item fields = ['embedding'] +class HalfVectorForm(ModelForm): + class Meta: + model = Item + fields = ['half_embedding'] + + +class BitForm(ModelForm): + class Meta: + model = Item + fields = ['binary_embedding'] + + +class SparseVectorForm(ModelForm): + class Meta: + model = Item + fields = ['sparse_embedding'] + + class TestDjango: - def setup_method(self, test_method): + def setup_method(self): Item.objects.all().delete() - def test_works(self): - item = Item(id=1, embedding=[1, 2, 3]) - item.save() + def test_vector(self): + Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - assert item.id == 1 - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() distance = MaxInnerProduct('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] - def test_cosine_distance(self): + def test_vector_cosine_distance(self): 
create_items() distance = CosineDistance('embedding', [1, 1, 1]) items = Item.objects.annotate(distance=distance).order_by(distance) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + def test_vector_l1_distance(self): + create_items() + distance = L1Distance('embedding', [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + + def test_halfvec(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + assert item.half_embedding == HalfVector([1, 2, 3]) + + def test_halfvec_l2_distance(self): + create_items() + distance = L2Distance('half_embedding', HalfVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + + def test_halfvec_max_inner_product(self): + create_items() + distance = MaxInnerProduct('half_embedding', HalfVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_halfvec_cosine_distance(self): + create_items() + distance = CosineDistance('half_embedding', HalfVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_halfvec_l1_distance(self): + create_items() + distance = L1Distance('half_embedding', HalfVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + + def test_bit(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + distance = HammingDistance('binary_embedding', '101') + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + create_items() + distance = JaccardDistance('binary_embedding', '101') + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + + def test_sparsevec(self): + Item(id=1, sparse_embedding=SparseVector([1, 2, 3])).save() + item = Item.objects.get(pk=1) + assert item.sparse_embedding == SparseVector([1, 2, 3]) + + def test_sparsevec_l2_distance(self): + create_items() + distance = L2Distance('sparse_embedding', SparseVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + + def test_sparsevec_max_inner_product(self): + create_items() + distance = MaxInnerProduct('sparse_embedding', SparseVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_sparsevec_cosine_distance(self): + create_items() + distance = CosineDistance('sparse_embedding', SparseVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + 
assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_sparsevec_l1_distance(self): + create_items() + distance = L1Distance('sparse_embedding', SparseVector([1, 1, 1])) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_filter(self): create_items() distance = L2Distance('embedding', [1, 1, 1]) items = Item.objects.alias(distance=distance).filter(distance__lt=1) assert [v.id for v in items] == [1] - def test_avg(self): + def test_vector_avg(self): avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] assert avg is None Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() avg = Item.objects.aggregate(Avg('embedding'))['embedding__avg'] - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + assert np.array_equal(avg, [2.5, 3.5, 4.5]) - def test_sum(self): + def test_vector_sum(self): sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] assert sum is None Item(embedding=[1, 2, 3]).save() Item(embedding=[4, 5, 6]).save() sum = Item.objects.aggregate(Sum('embedding'))['embedding__sum'] - assert np.array_equal(sum, np.array([5, 7, 9])) + assert np.array_equal(sum, [5, 7, 9]) + + def test_halfvec_avg(self): + avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] + assert avg is None + Item(half_embedding=[1, 2, 3]).save() + Item(half_embedding=[4, 5, 6]).save() + avg = Item.objects.aggregate(Avg('half_embedding'))['half_embedding__avg'] + assert avg == HalfVector([2.5, 3.5, 4.5]) + + def test_halfvec_sum(self): + sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] + assert sum is None + Item(half_embedding=[1, 2, 3]).save() + Item(half_embedding=[4, 5, 6]).save() + sum = Item.objects.aggregate(Sum('half_embedding'))['half_embedding__sum'] + assert sum == HalfVector([5, 7, 9]) def test_serialization(self): create_items() @@ -159,28 +329,121 @@ def test_serialization(self): for obj in serializers.deserialize(format, data): obj.save() - def test_form(self): - form = ItemForm(data={'embedding': '[1, 2, 3]'}) + def test_vector_form(self): + form = VectorForm(data={'embedding': '[1, 2, 3]'}) assert form.is_valid() assert 'value="[1, 2, 3]"' in form.as_div() - def test_form_instance(self): + def test_vector_form_instance(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - form = ItemForm(instance=item) + form = VectorForm(instance=item) assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() - def test_form_save(self): + def test_vector_form_save(self): Item(id=1, embedding=[1, 2, 3]).save() item = Item.objects.get(pk=1) - form = ItemForm(instance=item, data={'embedding': '[4, 5, 6]'}) + form = VectorForm(instance=item, data={'embedding': '[4, 5, 6]'}) + assert form.has_changed() + assert form.is_valid() + assert form.save() + assert np.array_equal(Item.objects.get(pk=1).embedding, [4, 5, 6]) + + def test_vector_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = VectorForm(instance=item, data={'embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).embedding is None + + def test_halfvec_form(self): + form = HalfVectorForm(data={'half_embedding': '[1, 2, 3]'}) + assert form.is_valid() + assert 'value="[1, 2, 3]"' in form.as_div() + + def test_halfvec_form_instance(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = 
Item.objects.get(pk=1) + form = HalfVectorForm(instance=item) + assert 'value="[1.0, 2.0, 3.0]"' in form.as_div() + + def test_halfvec_form_save(self): + Item(id=1, half_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = HalfVectorForm(instance=item, data={'half_embedding': '[4, 5, 6]'}) + assert form.has_changed() + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).half_embedding == HalfVector([4, 5, 6]) + + def test_halfvec_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = HalfVectorForm(instance=item, data={'half_embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).half_embedding is None + + def test_bit_form(self): + form = BitForm(data={'binary_embedding': '101'}) + assert form.is_valid() + assert 'value="101"' in form.as_div() + + def test_bit_form_instance(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item) + assert 'value="101"' in form.as_div() + + def test_bit_form_save(self): + Item(id=1, binary_embedding='101').save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item, data={'binary_embedding': '010'}) assert form.has_changed() assert form.is_valid() assert form.save() - assert [4, 5, 6] == Item.objects.get(pk=1).embedding.tolist() + assert '010' == Item.objects.get(pk=1).binary_embedding + + def test_bit_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = BitForm(instance=item, data={'binary_embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).binary_embedding is None + + def test_sparsevec_form(self): + form = SparseVectorForm(data={'sparse_embedding': '{1:1,2:2,3:3}/3'}) + assert form.is_valid() + assert 'value="{1:1,2:2,3:3}/3"' in form.as_div() + + def test_sparsevec_form_instance(self): + Item(id=1, sparse_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item) + # TODO improve + assert 'value="{1:1.0,2:2.0,3:3.0}/3"' in form.as_div() + + def test_sparsevec_form_save(self): + Item(id=1, sparse_embedding=[1, 2, 3]).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item, data={'sparse_embedding': '{1:4,2:5,3:6}/3'}) + assert form.has_changed() + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).sparse_embedding == SparseVector([4, 5, 6]) + + def test_sparsevec_form_save_missing(self): + Item(id=1).save() + item = Item.objects.get(pk=1) + form = SparseVectorForm(instance=item, data={'sparse_embedding': ''}) + assert form.is_valid() + assert form.save() + assert Item.objects.get(pk=1).sparse_embedding is None def test_clean(self): - item = Item(id=1, embedding=[1, 2, 3]) + item = Item(id=1, embedding=[1, 2, 3], half_embedding=[1, 2, 3], binary_embedding='101', sparse_embedding=SparseVector([1, 2, 3])) item.full_clean() def test_get_or_create(self): @@ -189,3 +452,45 @@ def test_get_or_create(self): def test_missing(self): Item().save() assert Item.objects.first().embedding is None + assert Item.objects.first().half_embedding is None + assert Item.objects.first().binary_embedding is None + assert Item.objects.first().sparse_embedding is None + + def test_vector_array(self): + Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]).save() + + with connection.cursor() as cursor: + from pgvector.psycopg import register_vector + register_vector(cursor.connection) + + # this fails if the driver does not cast
arrays + item = Item.objects.get(pk=1) + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) + + def test_double_array(self): + Item(id=1, double_embedding=[1, 1, 1]).save() + Item(id=2, double_embedding=[2, 2, 2]).save() + Item(id=3, double_embedding=[1, 1, 2]).save() + distance = L2Distance(Cast('double_embedding', VectorField()), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + assert items[1].double_embedding == [1, 1, 2] + + def test_numeric_array(self): + Item(id=1, numeric_embedding=[1, 1, 1]).save() + Item(id=2, numeric_embedding=[2, 2, 2]).save() + Item(id=3, numeric_embedding=[1, 1, 2]).save() + distance = L2Distance(Cast('numeric_embedding', VectorField()), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + assert items[1].numeric_embedding == [1, 1, 2] + + def test_half_precision(self): + create_items() + distance = L2Distance(Cast('embedding', HalfVectorField(dimensions=3)), [1, 1, 1]) + items = Item.objects.annotate(distance=distance).order_by(distance) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] diff --git a/tests/test_half_vector.py b/tests/test_half_vector.py new file mode 100644 index 0000000..78b4977 --- /dev/null +++ b/tests/test_half_vector.py @@ -0,0 +1,59 @@ +import numpy as np +from pgvector import HalfVector +import pytest +from struct import pack + + +class TestHalfVector: + def test_list(self): + assert HalfVector([1, 2, 3]).to_list() == [1, 2, 3] + + def test_list_str(self): + with pytest.raises(ValueError, match='could not convert string to float'): + HalfVector([1, 'two', 3]) + + def test_tuple(self): + assert HalfVector((1, 2, 3)).to_list() == [1, 2, 3] + + def test_ndarray(self): + arr = np.array([1, 2, 3]) + assert HalfVector(arr).to_list() == [1, 2, 3] + assert HalfVector(arr).to_numpy() is not arr + + def test_ndarray_same_object(self): + arr = np.array([1, 2, 3], dtype='>f2') + assert HalfVector(arr).to_list() == [1, 2, 3] + assert HalfVector(arr).to_numpy() is arr + + def test_ndim_two(self): + with pytest.raises(ValueError) as error: + HalfVector([[1, 2], [3, 4]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + HalfVector(1) + assert str(error.value) == 'expected ndim to be 1' + + def test_repr(self): + assert repr(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + assert str(HalfVector([1, 2, 3])) == 'HalfVector([1.0, 2.0, 3.0])' + + def test_equality(self): + assert HalfVector([1, 2, 3]) == HalfVector([1, 2, 3]) + assert HalfVector([1, 2, 3]) != HalfVector([1, 2, 4]) + + def test_dimensions(self): + assert HalfVector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = HalfVector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) + + def test_from_binary(self): + data = pack('>HH3e', 3, 0, 1.5, 2, 3) + vec = HalfVector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) + assert vec.to_binary() == data diff --git a/tests/test_peewee.py b/tests/test_peewee.py index 38acb7c..64fc009 100644 --- a/tests/test_peewee.py +++ b/tests/test_peewee.py @@ -1,7 +1,8 @@ from 
math import sqrt import numpy as np from peewee import Model, PostgresqlDatabase, fn -from pgvector.peewee import VectorField +from pgvector import HalfVector, SparseVector +from pgvector.peewee import VectorField, HalfVectorField, FixedBitField, SparseVectorField db = PostgresqlDatabase('pgvector_python_test') @@ -12,7 +13,13 @@ class Meta: class Item(BaseModel): - embedding = VectorField(dimensions=3) + embedding = VectorField(dimensions=3, null=True) + half_embedding = HalfVectorField(dimensions=3, null=True) + binary_embedding = FixedBitField(max_length=3, null=True) + sparse_embedding = SparseVectorField(dimensions=3, null=True) + + class Meta: + table_name = 'peewee_item' Item.add_index('embedding vector_l2_ops', using='hnsw') @@ -24,68 +31,194 @@ class Item(BaseModel): def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] - for i, v in enumerate(vectors): - Item.create(id=i + 1, embedding=v) + Item.create(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1])) + Item.create(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2])) + Item.create(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2])) class TestPeewee: - def setup_method(self, test_method): + def setup_method(self): Item.truncate_table() - def test_works(self): + def test_vector(self): Item.create(id=1, embedding=[1, 2, 3]) item = Item.get_by_id(1) - assert np.array_equal(item.embedding, np.array([1, 2, 3])) + assert np.array_equal(item.embedding, [1, 2, 3]) assert item.embedding.dtype == np.float32 - def test_l2_distance(self): + def test_vector_l2_distance(self): create_items() distance = Item.embedding.l2_distance([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 3, 2] assert [v.distance for v in items] == [0, 1, sqrt(3)] - def test_max_inner_product(self): + def test_vector_max_inner_product(self): create_items() distance = Item.embedding.max_inner_product([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [2, 3, 1] assert [v.distance for v in items] == [-6, -4, -3] - def test_cosine_distance(self): + def test_vector_cosine_distance(self): create_items() distance = Item.embedding.cosine_distance([1, 1, 1]) items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) assert [v.id for v in items] == [1, 2, 3] assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + def test_vector_l1_distance(self): + create_items() + distance = Item.embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + + def test_halfvec(self): + Item.create(id=1, half_embedding=[1, 2, 3]) + item = Item.get_by_id(1) + assert item.half_embedding == HalfVector([1, 2, 3]) + + def test_halfvec_l2_distance(self): + create_items() + distance = Item.half_embedding.l2_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + + def test_halfvec_max_inner_product(self): + create_items() + distance = Item.half_embedding.max_inner_product([1, 1, 1]) + items 
= Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_halfvec_cosine_distance(self): + create_items() + distance = Item.half_embedding.cosine_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_halfvec_l1_distance(self): + create_items() + distance = Item.half_embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + + def test_bit(self): + Item.create(id=1, binary_embedding='101') + item = Item.get_by_id(1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + distance = Item.binary_embedding.hamming_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [0, 1, 2] + + def test_bit_jaccard_distance(self): + create_items() + distance = Item.binary_embedding.jaccard_distance('101') + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + # assert [v.distance for v in items] == [0, 1/3, 1] + + def test_sparsevec(self): + Item.create(id=1, sparse_embedding=[1, 2, 3]) + item = Item.get_by_id(1) + assert item.sparse_embedding == SparseVector([1, 2, 3]) + + def test_sparsevec_l2_distance(self): + create_items() + distance = Item.sparse_embedding.l2_distance(SparseVector([1, 1, 1])) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, sqrt(3)] + + def test_sparsevec_max_inner_product(self): + create_items() + distance = Item.sparse_embedding.max_inner_product([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [2, 3, 1] + assert [v.distance for v in items] == [-6, -4, -3] + + def test_sparsevec_cosine_distance(self): + create_items() + distance = Item.sparse_embedding.cosine_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 2, 3] + assert [v.distance for v in items] == [0, 0, 0.05719095841793653] + + def test_sparsevec_l1_distance(self): + create_items() + distance = Item.sparse_embedding.l1_distance([1, 1, 1]) + items = Item.select(Item.id, distance.alias('distance')).order_by(distance).limit(5) + assert [v.id for v in items] == [1, 3, 2] + assert [v.distance for v in items] == [0, 1, 3] + def test_where(self): create_items() items = Item.select().where(Item.embedding.l2_distance([1, 1, 1]) < 1) assert [v.id for v in items] == [1] - def test_avg(self): - avg = Item.select(fn.avg(Item.embedding)).scalar() + def test_vector_avg(self): + avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() assert avg is None Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) - avg = Item.select(fn.avg(Item.embedding)).scalar() - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + avg = Item.select(fn.avg(Item.embedding).coerce(True)).scalar() + assert np.array_equal(avg, [2.5, 
3.5, 4.5]) - def test_sum(self): - sum = Item.select(fn.sum(Item.embedding)).scalar() + def test_vector_sum(self): + sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() assert sum is None Item.create(embedding=[1, 2, 3]) Item.create(embedding=[4, 5, 6]) - sum = Item.select(fn.sum(Item.embedding)).scalar() - assert np.array_equal(sum, np.array([5, 7, 9])) + sum = Item.select(fn.sum(Item.embedding).coerce(True)).scalar() + assert np.array_equal(sum, [5, 7, 9]) + + def test_halfvec_avg(self): + avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() + assert avg is None + Item.create(half_embedding=[1, 2, 3]) + Item.create(half_embedding=[4, 5, 6]) + avg = Item.select(fn.avg(Item.half_embedding).coerce(True)).scalar() + assert avg == HalfVector([2.5, 3.5, 4.5]) + + def test_halfvec_sum(self): + sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() + assert sum is None + Item.create(half_embedding=[1, 2, 3]) + Item.create(half_embedding=[4, 5, 6]) + sum = Item.select(fn.sum(Item.half_embedding).coerce(True)).scalar() + assert sum == HalfVector([5, 7, 9]) def test_get_or_create(self): Item.get_or_create(id=1, defaults={'embedding': [1, 2, 3]}) Item.get_or_create(embedding=np.array([4, 5, 6])) Item.get_or_create(embedding=Item.embedding.to_value([7, 8, 9])) + + def test_vector_array(self): + from playhouse.postgres_ext import PostgresqlExtDatabase, ArrayField + + ext_db = PostgresqlExtDatabase('pgvector_python_test') + + class ExtItem(BaseModel): + embeddings = ArrayField(VectorField, field_kwargs={'dimensions': 3}, index=False) + + class Meta: + database = ext_db + table_name = 'peewee_ext_item' + + ext_db.connect() + ext_db.drop_tables([ExtItem]) + ext_db.create_tables([ExtItem]) + + # fails with column "embeddings" is of type vector[] but expression is of type text[] + # ExtItem.create(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]) + # item = ExtItem.get_by_id(1) + # assert np.array_equal(item.embeddings[0], [1, 2, 3]) + # assert np.array_equal(item.embeddings[1], [4, 5, 6]) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py new file mode 100644 index 0000000..4d3e474 --- /dev/null +++ b/tests/test_pg8000.py @@ -0,0 +1,60 @@ +import numpy as np +import os +from pgvector import HalfVector, SparseVector, Vector +from pgvector.pg8000 import register_vector +from pg8000.native import Connection + +conn = Connection(os.environ["USER"], database='pgvector_python_test') + +conn.run('CREATE EXTENSION IF NOT EXISTS vector') +conn.run('DROP TABLE IF EXISTS pg8000_items') +conn.run('CREATE TABLE pg8000_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3))') + +register_vector(conn) + + +class TestPg8000: + def setup_method(self): + conn.run('DELETE FROM pg8000_items') + + def test_vector(self): + embedding = np.array([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT embedding FROM pg8000_items ORDER BY id') + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + 
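The new pg8000 suite registers the types on a `pg8000.native` connection and passes values as named parameters. For reference, the equivalent application-side usage is sketched below (user, database, and table names are illustrative assumptions):

```python
import os

from pg8000.native import Connection
from pgvector import Vector
from pgvector.pg8000 import register_vector

# connection settings are assumptions for illustration
conn = Connection(os.environ['USER'], database='example_db')
register_vector(conn)

# pg8000's native interface uses :name parameters rather than $1 or %s
rows = conn.run(
    'SELECT id FROM items ORDER BY embedding <-> :q LIMIT 5',
    q=Vector([1.5, 2, 3]),
)
conn.close()
```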
def test_halfvec(self): + embedding = HalfVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (half_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT half_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None + + def test_bit(self): + embedding = '101' + conn.run('INSERT INTO pg8000_items (binary_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT binary_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == '101' + assert res[1][0] is None + + def test_sparsevec(self): + embedding = SparseVector([1.5, 2, 3]) + conn.run('INSERT INTO pg8000_items (sparse_embedding) VALUES (:embedding), (NULL)', embedding=embedding) + + res = conn.run('SELECT sparse_embedding FROM pg8000_items ORDER BY id') + assert res[0][0] == embedding + assert res[1][0] is None diff --git a/tests/test_psycopg.py b/tests/test_psycopg.py index 73c54d9..698b34f 100644 --- a/tests/test_psycopg.py +++ b/tests/test_psycopg.py @@ -1,92 +1,223 @@ import numpy as np +from pgvector import Bit, HalfVector, SparseVector, Vector from pgvector.psycopg import register_vector, register_vector_async import psycopg +from psycopg_pool import ConnectionPool, AsyncConnectionPool import pytest conn = psycopg.connect(dbname='pgvector_python_test', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') -conn.execute('DROP TABLE IF EXISTS item') -conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') +conn.execute('DROP TABLE IF EXISTS psycopg_items') +conn.execute('CREATE TABLE psycopg_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[])') register_vector(conn) class TestPsycopg: - def setup_method(self, test_method): - conn.execute('DELETE FROM item') + def setup_method(self): + conn.execute('DELETE FROM psycopg_items') - def test_works(self): + def test_vector(self): embedding = np.array([1.5, 2, 3]) - conn.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s), (NULL)', (embedding,)) - res = conn.execute('SELECT * FROM item ORDER BY id').fetchall() - assert np.array_equal(res[0][1], embedding) - assert res[0][1].dtype == np.float32 - assert res[1][1] is None + res = conn.execute('SELECT embedding FROM psycopg_items ORDER BY id').fetchall() + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None - def test_binary_format(self): + def test_vector_binary_format(self): embedding = np.array([1.5, 2, 3]) - res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] + res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] assert np.array_equal(res, embedding) - def test_text_format(self): + def test_vector_text_format(self): embedding = np.array([1.5, 2, 3]) res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] assert np.array_equal(res, embedding) - def test_binary_format_correct(self): + def test_vector_binary_format_correct(self): embedding = np.array([1.5, 2, 3]) res = conn.execute('SELECT %b::vector::text', (embedding,)).fetchone()[0] assert res == '[1.5,2,3]' - def test_text_format_non_contiguous(self): + def test_vector_text_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %t::vector', 
(embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) - def test_binary_format_non_contiguous(self): + def test_vector_binary_format_non_contiguous(self): embedding = np.flipud(np.array([1.5, 2, 3])) assert not embedding.data.contiguous res = conn.execute('SELECT %b::vector', (embedding,)).fetchone()[0] - assert np.array_equal(res, np.array([3, 2, 1.5])) + assert np.array_equal(res, [3, 2, 1.5]) + + def test_vector_class_binary_format(self): + embedding = Vector([1.5, 2, 3]) + res = conn.execute('SELECT %b::vector', (embedding,), binary=True).fetchone()[0] + assert np.array_equal(res, [1.5, 2, 3]) + + def test_vector_class_text_format(self): + embedding = Vector([1.5, 2, 3]) + res = conn.execute('SELECT %t::vector', (embedding,)).fetchone()[0] + assert np.array_equal(res, [1.5, 2, 3]) + + def test_halfvec(self): + embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (half_embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT half_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) + + def test_halfvec_binary_format(self): + embedding = HalfVector([1.5, 2, 3]) + res = conn.execute('SELECT %b::halfvec', (embedding,), binary=True).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) + + def test_halfvec_text_format(self): + embedding = HalfVector([1.5, 2, 3]) + res = conn.execute('SELECT %t::halfvec', (embedding,)).fetchone()[0] + assert res == HalfVector([1.5, 2, 3]) + + def test_bit(self): + embedding = Bit([True, False, True]) + conn.execute('INSERT INTO psycopg_items (binary_embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT binary_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res == '101' + + def test_bit_binary_format(self): + embedding = Bit([False, True, False, True, False, False, False, False, True]) + res = conn.execute('SELECT %b::bit(9)', (embedding,), binary=True).fetchone()[0] + assert repr(Bit.from_binary(res)) == 'Bit(010100001)' + + def test_bit_text_format(self): + embedding = Bit([False, True, False, True, False, False, False, False, True]) + res = conn.execute('SELECT %t::bit(9)', (embedding,)).fetchone()[0] + assert res == '010100001' + assert repr(Bit(res)) == 'Bit(010100001)' + + def test_sparsevec(self): + embedding = SparseVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (sparse_embedding) VALUES (%s)', (embedding,)) + + res = conn.execute('SELECT sparse_embedding FROM psycopg_items ORDER BY id').fetchone()[0] + assert res == SparseVector([1.5, 2, 3]) + + def test_sparsevec_binary_format(self): + embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) + res = conn.execute('SELECT %b::sparsevec', (embedding,), binary=True).fetchone()[0] + assert res == embedding + + def test_sparsevec_text_format(self): + embedding = SparseVector([1.5, 0, 2, 0, 3, 0]) + res = conn.execute('SELECT %t::sparsevec', (embedding,)).fetchone()[0] + assert res == embedding + + def test_text_copy_from(self): + embedding = np.array([1.5, 2, 3]) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN") as copy: + copy.write_row([embedding, HalfVector(embedding), '101', SparseVector(embedding)]) - def test_text_copy(self): + def test_binary_copy_from(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY item (embedding) FROM STDIN") as copy: - copy.write_row([embedding]) + with cur.copy("COPY 
psycopg_items (embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.write_row([embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) - def test_binary_copy(self): + def test_binary_copy_from_set_types(self): embedding = np.array([1.5, 2, 3]) cur = conn.cursor() - with cur.copy("COPY item (embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.write_row([embedding]) + with cur.copy("COPY psycopg_items (id, embedding, half_embedding, binary_embedding, sparse_embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: + copy.set_types(['int8', 'vector', 'halfvec', 'bit', 'sparsevec']) + copy.write_row([1, embedding, HalfVector(embedding), Bit('101'), SparseVector(embedding)]) - def test_binary_copy_set_types(self): + def test_text_copy_to(self): embedding = np.array([1.5, 2, 3]) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) cur = conn.cursor() - with cur.copy("COPY item (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy: - copy.set_types(['int8', 'vector']) - copy.write_row([1, embedding]) + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT") as copy: + for row in copy.rows(): + assert row[0] == "[1.5,2,3]" + assert row[1] == "[1.5,2,3]" + + def test_binary_copy_to(self): + embedding = np.array([1.5, 2, 3]) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + for row in copy.rows(): + assert np.array_equal(Vector.from_binary(row[0]).to_numpy(), embedding) + assert HalfVector.from_binary(row[1]) == half_embedding + + def test_binary_copy_to_set_types(self): + embedding = np.array([1.5, 2, 3]) + half_embedding = HalfVector([1.5, 2, 3]) + conn.execute('INSERT INTO psycopg_items (embedding, half_embedding) VALUES (%s, %s)', (embedding, half_embedding)) + cur = conn.cursor() + with cur.copy("COPY psycopg_items (embedding, half_embedding) TO STDOUT WITH (FORMAT BINARY)") as copy: + copy.set_types(['vector', 'halfvec']) + for row in copy.rows(): + assert np.array_equal(row[0], embedding) + assert row[1] == half_embedding + + def test_vector_array(self): + embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + conn.execute('INSERT INTO psycopg_items (embeddings) VALUES (%s)', (embeddings,)) + + res = conn.execute('SELECT embeddings FROM psycopg_items ORDER BY id').fetchone() + assert np.array_equal(res[0][0], embeddings[0]) + assert np.array_equal(res[0][1], embeddings[1]) + + def test_pool(self): + def configure(conn): + register_vector(conn) + + pool = ConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=True, configure=configure) + + with pool.connection() as conn: + res = conn.execute("SELECT '[1,2,3]'::vector").fetchone() + assert np.array_equal(res[0], [1, 2, 3]) + + pool.close() @pytest.mark.asyncio async def test_async(self): conn = await psycopg.AsyncConnection.connect(dbname='pgvector_python_test', autocommit=True) await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') - await conn.execute('DROP TABLE IF EXISTS item') - await conn.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') + await conn.execute('DROP TABLE IF EXISTS psycopg_items') + await conn.execute('CREATE TABLE 
psycopg_items (id bigserial PRIMARY KEY, embedding vector(3))') await register_vector_async(conn) embedding = np.array([1.5, 2, 3]) - await conn.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + await conn.execute('INSERT INTO psycopg_items (embedding) VALUES (%s), (NULL)', (embedding,)) async with conn.cursor() as cur: - await cur.execute('SELECT * FROM item ORDER BY id') + await cur.execute('SELECT * FROM psycopg_items ORDER BY id') res = await cur.fetchall() assert np.array_equal(res[0][1], embedding) assert res[0][1].dtype == np.float32 assert res[1][1] is None + + @pytest.mark.asyncio + async def test_async_pool(self): + async def configure(conn): + await register_vector_async(conn) + + pool = AsyncConnectionPool(conninfo='postgres://localhost/pgvector_python_test', open=False, configure=configure) + await pool.open() + + async with pool.connection() as conn: + async with conn.cursor() as cur: + await cur.execute("SELECT '[1,2,3]'::vector") + res = await cur.fetchone() + assert np.array_equal(res[0], [1, 2, 3]) + + await pool.close() diff --git a/tests/test_psycopg2.py b/tests/test_psycopg2.py index ac899ed..7f4932d 100644 --- a/tests/test_psycopg2.py +++ b/tests/test_psycopg2.py @@ -1,28 +1,136 @@ import numpy as np +from pgvector import HalfVector, SparseVector, Vector from pgvector.psycopg2 import register_vector import psycopg2 +from psycopg2.extras import DictCursor, RealDictCursor, NamedTupleCursor +from psycopg2.pool import ThreadedConnectionPool conn = psycopg2.connect(dbname='pgvector_python_test') conn.autocommit = True cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') -cur.execute('DROP TABLE IF EXISTS item') -cur.execute('CREATE TABLE item (id bigserial primary key, embedding vector(3))') +cur.execute('DROP TABLE IF EXISTS psycopg2_items') +cur.execute('CREATE TABLE psycopg2_items (id bigserial PRIMARY KEY, embedding vector(3), half_embedding halfvec(3), binary_embedding bit(3), sparse_embedding sparsevec(3), embeddings vector[], half_embeddings halfvec[], sparse_embeddings sparsevec[])') register_vector(cur) class TestPsycopg2: - def setup_method(self, test_method): - cur.execute('DELETE FROM item') + def setup_method(self): + cur.execute('DELETE FROM psycopg2_items') - def test_works(self): + def test_vector(self): embedding = np.array([1.5, 2, 3]) - cur.execute('INSERT INTO item (embedding) VALUES (%s), (NULL)', (embedding,)) + cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) - cur.execute('SELECT * FROM item ORDER BY id') + cur.execute('SELECT embedding FROM psycopg2_items ORDER BY id') res = cur.fetchall() - assert np.array_equal(res[0][1], embedding) - assert res[0][1].dtype == np.float32 - assert res[1][1] is None + assert np.array_equal(res[0][0], embedding) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_vector_class(self): + embedding = Vector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert np.array_equal(res[0][0], embedding.to_numpy()) + assert res[0][0].dtype == np.float32 + assert res[1][0] is None + + def test_halfvec(self): + embedding = [1.5, 2, 3] + cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == HalfVector([1.5, 2, 3]) + 
assert res[1][0] is None + + def test_halfvec_class(self): + embedding = HalfVector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (half_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT half_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == embedding + assert res[1][0] is None + + def test_bit(self): + embedding = '101' + cur.execute('INSERT INTO psycopg2_items (binary_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT binary_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == '101' + assert res[1][0] is None + + def test_sparsevec(self): + embedding = SparseVector([1.5, 2, 3]) + cur.execute('INSERT INTO psycopg2_items (sparse_embedding) VALUES (%s), (NULL)', (embedding,)) + + cur.execute('SELECT sparse_embedding FROM psycopg2_items ORDER BY id') + res = cur.fetchall() + assert res[0][0] == SparseVector([1.5, 2, 3]) + assert res[1][0] is None + + def test_vector_array(self): + embeddings = [np.array([1.5, 2, 3]), np.array([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (embeddings) VALUES (%s::vector[])', (embeddings,)) + + cur.execute('SELECT embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert np.array_equal(res[0][0], embeddings[0]) + assert np.array_equal(res[0][1], embeddings[1]) + + def test_halfvec_array(self): + embeddings = [HalfVector([1.5, 2, 3]), HalfVector([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (half_embeddings) VALUES (%s::halfvec[])', (embeddings,)) + + cur.execute('SELECT half_embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert res[0] == [HalfVector([1.5, 2, 3]), HalfVector([4.5, 5, 6])] + + def test_sparsevec_array(self): + embeddings = [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] + cur.execute('INSERT INTO psycopg2_items (sparse_embeddings) VALUES (%s::sparsevec[])', (embeddings,)) + + cur.execute('SELECT sparse_embeddings FROM psycopg2_items ORDER BY id') + res = cur.fetchone() + assert res[0] == [SparseVector([1.5, 2, 3]), SparseVector([4.5, 5, 6])] + + def test_cursor_factory(self): + for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: + conn = psycopg2.connect(dbname='pgvector_python_test') + cur = conn.cursor(cursor_factory=cursor_factory) + register_vector(cur) + conn.close() + + def test_cursor_factory_connection(self): + for cursor_factory in [DictCursor, RealDictCursor, NamedTupleCursor]: + conn = psycopg2.connect(dbname='pgvector_python_test', cursor_factory=cursor_factory) + register_vector(conn) + conn.close() + + def test_pool(self): + pool = ThreadedConnectionPool(1, 1, dbname='pgvector_python_test') + + conn = pool.getconn() + try: + # use globally=True for apps to ensure registered with all connections + register_vector(conn) + finally: + pool.putconn(conn) + + conn = pool.getconn() + try: + cur = conn.cursor() + cur.execute("SELECT '[1,2,3]'::vector") + res = cur.fetchone() + assert np.array_equal(res[0], [1, 2, 3]) + finally: + pool.putconn(conn) + + pool.closeall() diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py new file mode 100644 index 0000000..d580f32 --- /dev/null +++ b/tests/test_sparse_vector.py @@ -0,0 +1,112 @@ +import numpy as np +from pgvector import SparseVector +import pytest +from scipy.sparse import coo_array, coo_matrix, csr_array, csr_matrix +from struct import pack + + +class TestSparseVector: + def test_list(self): + vec = SparseVector([1, 0, 2, 0, 3, 0]) + assert vec.to_list() == [1, 0, 
2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), [1, 0, 2, 0, 3, 0]) + assert vec.indices() == [0, 2, 4] + + def test_list_dimensions(self): + with pytest.raises(ValueError) as error: + SparseVector([1, 0, 2, 0, 3, 0], 6) + assert str(error.value) == 'extra argument' + + def test_ndarray(self): + vec = SparseVector(np.array([1, 0, 2, 0, 3, 0])) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_dict(self): + vec = SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_dict_no_dimensions(self): + with pytest.raises(ValueError) as error: + SparseVector({0: 1, 2: 2, 4: 3}) + assert str(error.value) == 'missing dimensions' + + def test_coo_array(self): + arr = coo_array(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_coo_array_dimensions(self): + with pytest.raises(ValueError) as error: + SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) + assert str(error.value) == 'extra argument' + + def test_coo_matrix(self): + mat = coo_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_dok_array(self): + arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_csr_array(self): + arr = csr_array(np.array([[1, 0, 2, 0, 3, 0]])) + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_csr_matrix(self): + mat = csr_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_repr(self): + assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' + assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' + + def test_equality(self): + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector([1, 0, 2, 0, 3, 0]) + assert SparseVector([1, 0, 2, 0, 3, 0]) != SparseVector([1, 0, 2, 0, 3, 1]) + assert SparseVector([1, 0, 2, 0, 3, 0]) == SparseVector({2: 2, 4: 3, 0: 1, 3: 0}, 6) + assert SparseVector({}, 1) != SparseVector({}, 2) + + def test_dimensions(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).dimensions() == 6 + + def test_indices(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).indices() == [0, 2, 4] + + def test_values(self): + assert SparseVector([1, 0, 2, 0, 3, 0]).values() == [1, 2, 3] + + def test_to_coo(self): + assert np.array_equal(SparseVector([1, 0, 2, 0, 3, 0]).to_coo().toarray(), [[1, 0, 2, 0, 3, 0]]) + + def test_zero_vector_text(self): + vec = SparseVector({}, 3) + assert vec.to_list() == SparseVector.from_text(vec.to_text()).to_list() + + def test_from_text(self): + vec = SparseVector.from_text('{1:1.5,3:2,5:3}/6') + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert np.array_equal(vec.to_numpy(), [1.5, 0, 2, 0, 3, 0]) + + def test_from_binary(self): + data = pack('>iii3i3f', 6, 3, 0, 0, 2, 4, 1.5, 2, 3) + vec = SparseVector.from_binary(data) + assert vec.dimensions() == 6 + assert vec.indices() == [0, 2, 4] + assert vec.values() == [1.5, 2, 3] + assert vec.to_list() == [1.5, 0, 2, 0, 3, 0] + assert 
np.array_equal(vec.to_numpy(), [1.5, 0, 2, 0, 3, 0]) + assert vec.to_binary() == data diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 21b46ac..5aec977 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,13 +1,74 @@ +import asyncpg import numpy as np -from pgvector.sqlalchemy import Vector +import os +from pgvector import HalfVector, SparseVector, Vector +from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest -from sqlalchemy import create_engine, select, text, MetaData, Table, Column, Index, Integer +from sqlalchemy import create_engine, event, insert, inspect, select, text, MetaData, Table, Column, Index, Integer, ARRAY from sqlalchemy.exc import StatementError -from sqlalchemy.orm import declarative_base, mapped_column, Session +from sqlalchemy.ext.automap import automap_base +from sqlalchemy.orm import declarative_base, Session from sqlalchemy.sql import func -engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') -with Session(engine) as session: +try: + from sqlalchemy.orm import mapped_column + from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + sqlalchemy_version = 2 +except ImportError: + mapped_column = Column + sqlalchemy_version = 1 + +psycopg2_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') +psycopg2_type_engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test') + + +@event.listens_for(psycopg2_type_engine, "connect") +def psycopg2_connect(dbapi_connection, connection_record): + from pgvector.psycopg2 import register_vector + register_vector(dbapi_connection) + + +pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') + +if sqlalchemy_version > 1: + psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + psycopg_type_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_type_engine, "connect") + def psycopg_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector + register_vector(dbapi_connection) + + psycopg_async_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') + + @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") + def psycopg_async_connect(dbapi_connection, connection_record): + from pgvector.psycopg import register_vector_async + dbapi_connection.run_async(register_vector_async) + + asyncpg_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') + + @event.listens_for(asyncpg_type_engine.sync_engine, "connect") + def asyncpg_connect(dbapi_connection, connection_record): + from pgvector.asyncpg import register_vector + dbapi_connection.run_async(register_vector) + +engines = [psycopg2_engine, psycopg2_type_engine, pg8000_engine] +array_engines = [psycopg2_type_engine] +async_engines = [] +async_array_engines = [] + +if sqlalchemy_version > 1: + engines += [psycopg_engine, psycopg_type_engine] + array_engines += [psycopg_type_engine] + # TODO support asyncpg_type_engine + async_engines += [psycopg_async_engine, psycopg_async_type_engine, asyncpg_engine] + async_array_engines += [psycopg_async_type_engine, asyncpg_engine] + +setup_engine = 
engines[0] +with Session(setup_engine) as session: session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) session.commit() @@ -15,49 +76,86 @@ class Item(Base): - __tablename__ = 'orm_item' + __tablename__ = 'sqlalchemy_orm_item' id = mapped_column(Integer, primary_key=True) - embedding = mapped_column(Vector(3)) + embedding = mapped_column(VECTOR(3)) + half_embedding = mapped_column(HALFVEC(3)) + binary_embedding = mapped_column(BIT(3)) + sparse_embedding = mapped_column(SPARSEVEC(3)) + embeddings = mapped_column(ARRAY(VECTOR(3))) + half_embeddings = mapped_column(ARRAY(HALFVEC(3))) + + +Base.metadata.drop_all(setup_engine) +Base.metadata.create_all(setup_engine) + +index = Index( + 'sqlalchemy_orm_index', + Item.embedding, + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'vector_l2_ops'} +) +index.create(setup_engine) + +half_precision_index = Index( + 'sqlalchemy_orm_half_precision_index', + func.cast(Item.embedding, HALFVEC(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'halfvec_l2_ops'} +) +half_precision_index.create(setup_engine) + +binary_quantize_index = Index( + 'sqlalchemy_orm_binary_quantize_index', + func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'bit_hamming_ops'} +) +binary_quantize_index.create(setup_engine) -Base.metadata.drop_all(engine) -Base.metadata.create_all(engine) +def create_items(): + with Session(setup_engine) as session: + session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1]))) + session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2]))) + session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2]))) + session.commit() -def create_items(): - vectors = [ - [1, 1, 1], - [2, 2, 2], - [1, 1, 2] - ] - session = Session(engine) - for i, v in enumerate(vectors): - session.add(Item(id=i + 1, embedding=v)) - session.commit() +def delete_items(): + with Session(setup_engine) as session: + session.query(Item).delete() + session.commit() +@pytest.mark.parametrize('engine', engines) class TestSqlalchemy: - def setup_method(self, test_method): - with Session(engine) as session: - session.query(Item).delete() - session.commit() + def setup_method(self): + delete_items() - def test_core(self): + def test_core(self, engine): metadata = MetaData() item_table = Table( - 'core_item', + 'sqlalchemy_core_item', metadata, Column('id', Integer, primary_key=True), - Column('embedding', Vector(3)) + Column('embedding', VECTOR(3)), + Column('half_embedding', HALFVEC(3)), + Column('binary_embedding', BIT(3)), + Column('sparse_embedding', SPARSEVEC(3)), + Column('embeddings', ARRAY(VECTOR(3))) ) metadata.drop_all(engine) metadata.create_all(engine) ivfflat_index = Index( - 'ivfflat_core_index', + 'sqlalchemy_core_ivfflat_index', item_table.c.embedding, postgresql_using='ivfflat', postgresql_with={'lists': 1}, @@ -66,7 +164,7 @@ def test_core(self): ivfflat_index.create(engine) hnsw_index = Index( - 'hnsw_core_index', + 'sqlalchemy_core_hnsw_index', item_table.c.embedding, postgresql_using='hnsw', postgresql_with={'m': 16, 'ef_construction': 64}, @@ -74,124 +172,478 @@ 
def test_core(self): ) hnsw_index.create(engine) - def test_orm(self): + def test_orm(self, engine): item = Item(embedding=np.array([1.5, 2, 3])) item2 = Item(embedding=[4, 5, 6]) item3 = Item() - session = Session(engine) - session.add(item) - session.add(item2) - session.add(item3) - session.commit() + with Session(engine) as session: + session.add(item) + session.add(item2) + session.add(item3) + session.commit() stmt = select(Item) with Session(engine) as session: items = [v[0] for v in session.execute(stmt).all()] - assert items[0].id == 1 - assert items[1].id == 2 - assert items[2].id == 3 + # TODO improve + assert items[0].id % 3 == 1 + assert items[1].id % 3 == 2 + assert items[2].id % 3 == 0 assert np.array_equal(items[0].embedding, np.array([1.5, 2, 3])) assert items[0].embedding.dtype == np.float32 assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) assert items[1].embedding.dtype == np.float32 assert items[2].embedding is None - def test_l2_distance(self): + def test_vector(self, engine): + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert np.array_equal(item.embedding, [1, 2, 3]) + + def test_vector_l2_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.l2_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 3, 2] - def test_l2_distance_orm(self): + def test_vector_l2_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] - def test_max_inner_product(self): + def test_vector_max_inner_product(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.max_inner_product([1, 1, 1])).all() assert [v.id for v in items] == [2, 3, 1] - def test_max_inner_product_orm(self): + def test_vector_max_inner_product_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) assert [v.id for v in items] == [2, 3, 1] - def test_cosine_distance(self): + def test_vector_cosine_distance(self, engine): create_items() with Session(engine) as session: items = session.query(Item).order_by(Item.embedding.cosine_distance([1, 1, 1])).all() assert [v.id for v in items] == [1, 2, 3] - def test_cosine_distance_orm(self): + def test_vector_cosine_distance_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 2, 3] - def test_filter(self): + def test_vector_l1_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_vector_l1_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec(self, engine): + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding == HalfVector([1, 2, 3]) + + def test_halfvec_l2_distance(self, engine): + create_items() + with 
Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_l2_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_max_inner_product(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1])).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_max_inner_product_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_cosine_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_cosine_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_l1_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_l1_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_bit(self, engine): + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.hamming_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_hamming_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self, engine): + if engine == pg8000_engine: + return + + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.binary_embedding.jaccard_distance('101')).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance_orm(self, engine): + if engine == pg8000_engine: + return + + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec(self, engine): + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding == SparseVector([1, 2, 3]) + + def test_sparsevec_l2_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1])).all() + assert 
[v.id for v in items] == [1, 3, 2] + + def test_sparsevec_l2_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_max_inner_product(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1])).all() + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_max_inner_product_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_cosine_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_cosine_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_l1_distance(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_l1_distance_orm(self, engine): + create_items() + with Session(engine) as session: + items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_filter(self, engine): create_items() with Session(engine) as session: items = session.query(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1).all() assert [v.id for v in items] == [1] - def test_filter_orm(self): + def test_filter_orm(self, engine): create_items() with Session(engine) as session: items = session.scalars(select(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1)) assert [v.id for v in items] == [1] - def test_select(self): + def test_select(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) - item = session.query(Item.embedding.l2_distance([1, 1, 1])).first() - assert item[0] == 3 + items = session.query(Item.embedding.l2_distance([1, 1, 1])).first() + assert items[0] == 3 - def test_select_orm(self): + def test_select_orm(self, engine): with Session(engine) as session: session.add(Item(embedding=[2, 3, 3])) - item = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() - assert item[0] == 3 + items = session.scalars(select(Item.embedding.l2_distance([1, 1, 1]))).all() + assert items[0] == 3 + + def test_avg(self, engine): + with Session(engine) as session: + res = session.query(avg(Item.embedding)).first()[0] + assert res is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + res = session.query(avg(Item.embedding)).first()[0] + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) + + def test_avg_orm(self, engine): + with Session(engine) as session: + res = session.scalars(select(avg(Item.embedding))).first() + assert res is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + res = session.scalars(select(avg(Item.embedding))).first() + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) - def 
test_avg(self): + def test_sum(self, engine): with Session(engine) as session: - avg = session.query(func.avg(Item.embedding)).first()[0] - assert avg is None + res = session.query(sum(Item.embedding)).first()[0] + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - avg = session.query(func.avg(Item.embedding)).first()[0] - assert np.array_equal(avg, np.array([2.5, 3.5, 4.5])) + res = session.query(sum(Item.embedding)).first()[0] + assert np.array_equal(res, np.array([5, 7, 9])) - def test_sum(self): + def test_sum_orm(self, engine): with Session(engine) as session: - sum = session.query(func.sum(Item.embedding)).first()[0] - assert sum is None + res = session.scalars(select(sum(Item.embedding))).first() + assert res is None session.add(Item(embedding=[1, 2, 3])) session.add(Item(embedding=[4, 5, 6])) - sum = session.query(func.sum(Item.embedding)).first()[0] - assert np.array_equal(sum, np.array([5, 7, 9])) + res = session.scalars(select(sum(Item.embedding))).first() + assert np.array_equal(res, np.array([5, 7, 9])) - def test_bad_dimensions(self): + def test_bad_dimensions(self, engine): item = Item(embedding=[1, 2]) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() - def test_bad_ndim(self): + def test_bad_ndim(self, engine): item = Item(embedding=np.array([[1, 2, 3]])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='expected ndim to be 1'): - session.commit() + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected ndim to be 1'): + session.commit() - def test_bad_dtype(self): + def test_bad_dtype(self, engine): item = Item(embedding=np.array(['one', 'two', 'three'])) - session = Session(engine) - session.add(item) - with pytest.raises(StatementError, match='dtype must be numeric'): + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='could not convert string to float'): + session.commit() + + def test_inspect(self, engine): + columns = inspect(engine).get_columns('sqlalchemy_orm_item') + assert isinstance(columns[1]['type'], VECTOR) + + def test_literal_binds(self, engine): + sql = select(Item).order_by(Item.embedding.l2_distance([1, 2, 3])).compile(engine, compile_kwargs={'literal_binds': True}) + assert "embedding <-> '[1.0,2.0,3.0]'" in str(sql) + + def test_insert(self, engine): + with Session(engine) as session: + session.execute(insert(Item).values(embedding=np.array([1, 2, 3]))) + + def test_insert_bulk(self, engine): + with Session(engine) as session: + session.execute(insert(Item), [{'embedding': np.array([1, 2, 3])}]) + + # register_vector in psycopg2 tests change this behavior + # def test_insert_text(self): + # with Session(engine) as session: + # session.execute(text('INSERT INTO sqlalchemy_orm_item (embedding) VALUES (:embedding)'), {'embedding': np.array([1, 2, 3])}) + + def test_automap(self, engine): + metadata = MetaData() + metadata.reflect(engine, only=['sqlalchemy_orm_item']) + AutoBase = automap_base(metadata=metadata) + AutoBase.prepare() + AutoItem = AutoBase.classes.sqlalchemy_orm_item + with Session(engine) as session: + session.execute(insert(AutoItem), [{'embedding': np.array([1, 2, 3])}]) + item = session.query(AutoItem).first() + 
assert np.array_equal(item.embedding, [1, 2, 3]) + + def test_half_precision(self, engine): + create_items() + with Session(engine) as session: + items = session.query(Item).order_by(func.cast(Item.embedding, HALFVEC(3)).l2_distance([1, 1, 1])).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_binary_quantize(self, engine): + with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) session.commit() + + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + items = session.query(Item).order_by(distance).all() + assert [v.id for v in items] == [2, 3, 1] + + +@pytest.mark.parametrize('engine', array_engines) +class TestSqlalchemyArray: + def setup_method(self): + delete_items() + + def test_vector_array(self, engine): + with Session(engine) as session: + session.add(Item(id=1, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() + + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert np.array_equal(item.embeddings[0], [1, 2, 3]) + assert np.array_equal(item.embeddings[1], [4, 5, 6]) + + def test_halfvec_array(self, engine): + with Session(engine) as session: + session.add(Item(id=1, half_embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])])) + session.commit() + + # this fails if the driver does not cast arrays + item = session.get(Item, 1) + assert item.half_embeddings == [HalfVector([1, 2, 3]), HalfVector([4, 5, 6])] + + +@pytest.mark.parametrize('engine', async_engines) +class TestSqlalchemyAsync: + def setup_method(self): + delete_items() + + @pytest.mark.asyncio + async def test_vector(self, engine): + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = np.array([1, 2, 3]) + session.add(Item(id=1, embedding=embedding)) + item = await session.get(Item, 1) + assert np.array_equal(item.embedding, embedding) + + await engine.dispose() + + @pytest.mark.asyncio + async def test_halfvec(self, engine): + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, half_embedding=embedding)) + item = await session.get(Item, 1) + assert item.half_embedding == HalfVector(embedding) + + await engine.dispose() + + @pytest.mark.asyncio + async def test_bit(self, engine): + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = asyncpg.BitString('101') if engine == asyncpg_engine else '101' + session.add(Item(id=1, binary_embedding=embedding)) + item = await session.get(Item, 1) + assert item.binary_embedding == embedding + + await engine.dispose() + + @pytest.mark.asyncio + async def test_sparsevec(self, engine): + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() as session: + async with session.begin(): + embedding = [1, 2, 3] + session.add(Item(id=1, sparse_embedding=embedding)) + item = await session.get(Item, 1) + assert item.sparse_embedding == SparseVector(embedding) + + await engine.dispose() + + @pytest.mark.asyncio + async def test_avg(self, engine): + async_session = async_sessionmaker(engine, expire_on_commit=False) + + async with async_session() 
as session:
+            async with session.begin():
+                session.add(Item(embedding=[1, 2, 3]))
+                session.add(Item(embedding=[4, 5, 6]))
+                res = await session.scalars(select(avg(Item.embedding)))
+                assert np.array_equal(res.first(), [2.5, 3.5, 4.5])
+
+        await engine.dispose()
+
+
+@pytest.mark.parametrize('engine', async_array_engines)
+class TestSqlalchemyAsyncArray:
+    def setup_method(self):
+        delete_items()
+
+    @pytest.mark.asyncio
+    async def test_vector_array(self, engine):
+        async_session = async_sessionmaker(engine, expire_on_commit=False)
+
+        async with async_session() as session:
+            async with session.begin():
+                session.add(Item(id=1, embeddings=[Vector([1, 2, 3]), Vector([4, 5, 6])]))
+                item = await session.get(Item, 1)
+                assert np.array_equal(item.embeddings[0], [1, 2, 3])
+                assert np.array_equal(item.embeddings[1], [4, 5, 6])
+
+                session.add(Item(id=2, embeddings=[np.array([1, 2, 3]), np.array([4, 5, 6])]))
+                item = await session.get(Item, 2)
+                assert np.array_equal(item.embeddings[0], [1, 2, 3])
+                assert np.array_equal(item.embeddings[1], [4, 5, 6])
+
+        await engine.dispose()
diff --git a/tests/test_sqlmodel.py b/tests/test_sqlmodel.py
new file mode 100644
index 0000000..f4994f4
--- /dev/null
+++ b/tests/test_sqlmodel.py
@@ -0,0 +1,239 @@
+import numpy as np
+from pgvector import HalfVector, SparseVector
+from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum
+import pytest
+from sqlalchemy.exc import StatementError
+from sqlmodel import Field, Index, Session, SQLModel, create_engine, delete, select, text
+from typing import Any, Optional
+
+engine = create_engine('postgresql+psycopg2://localhost/pgvector_python_test')
+with Session(engine) as session:
+    session.exec(text('CREATE EXTENSION IF NOT EXISTS vector'))
+    session.commit()
+
+
+class Item(SQLModel, table=True):
+    __tablename__ = 'sqlmodel_item'
+
+    id: Optional[int] = Field(default=None, primary_key=True)
+    embedding: Optional[Any] = Field(default=None, sa_type=VECTOR(3))
+    half_embedding: Optional[Any] = Field(default=None, sa_type=HALFVEC(3))
+    binary_embedding: Optional[Any] = Field(default=None, sa_type=BIT(3))
+    sparse_embedding: Optional[Any] = Field(default=None, sa_type=SPARSEVEC(3))
+
+
+SQLModel.metadata.drop_all(engine)
+SQLModel.metadata.create_all(engine)
+
+index = Index(
+    'sqlmodel_index',
+    Item.embedding,
+    postgresql_using='hnsw',
+    postgresql_with={'m': 16, 'ef_construction': 64},
+    postgresql_ops={'embedding': 'vector_l2_ops'}
+)
+index.create(engine)
+
+
+def create_items():
+    with Session(engine) as session:
+        session.add(Item(id=1, embedding=[1, 1, 1], half_embedding=[1, 1, 1], binary_embedding='000', sparse_embedding=SparseVector([1, 1, 1])))
+        session.add(Item(id=2, embedding=[2, 2, 2], half_embedding=[2, 2, 2], binary_embedding='101', sparse_embedding=SparseVector([2, 2, 2])))
+        session.add(Item(id=3, embedding=[1, 1, 2], half_embedding=[1, 1, 2], binary_embedding='111', sparse_embedding=SparseVector([1, 1, 2])))
+        session.commit()
+
+
+class TestSqlmodel:
+    def setup_method(self):
+        with Session(engine) as session:
+            session.exec(delete(Item))
+            session.commit()
+
+    def test_orm(self):
+        item = Item(embedding=[1.5, 2, 3])
+        item2 = Item(embedding=[4, 5, 6])
+        item3 = Item()
+
+        with Session(engine) as session:
+            session.add(item)
+            session.add(item2)
+            session.add(item3)
+            session.commit()
+
+        stmt = select(Item)
+        with Session(engine) as session:
+            items = session.exec(stmt).all()
+            assert items[0].id == 1
+            assert items[1].id == 2
+            assert items[2].id == 3
+            assert np.array_equal(items[0].embedding, 
np.array([1.5, 2, 3])) + assert items[0].embedding.dtype == np.float32 + assert np.array_equal(items[1].embedding, np.array([4, 5, 6])) + assert items[1].embedding.dtype == np.float32 + assert items[2].embedding is None + + def test_vector(self): + with Session(engine) as session: + session.add(Item(id=1, embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert np.array_equal(item.embedding, np.array([1, 2, 3])) + + def test_vector_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_vector_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_vector_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_vector_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec(self): + with Session(engine) as session: + session.add(Item(id=1, half_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.half_embedding == HalfVector([1, 2, 3]) + + def test_halfvec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.l2_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_halfvec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_halfvec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_halfvec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.half_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_bit(self): + with Session(engine) as session: + session.add(Item(id=1, binary_embedding='101')) + session.commit() + item = session.get(Item, 1) + assert item.binary_embedding == '101' + + def test_bit_hamming_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.hamming_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_bit_jaccard_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.binary_embedding.jaccard_distance('101'))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec(self): + with Session(engine) as session: + session.add(Item(id=1, sparse_embedding=[1, 2, 3])) + session.commit() + item = session.get(Item, 1) + assert item.sparse_embedding == SparseVector([1, 2, 3]) + + def test_sparsevec_l2_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.l2_distance([1, 1, 1]))) + 
assert [v.id for v in items] == [1, 3, 2] + + def test_sparsevec_max_inner_product(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.max_inner_product([1, 1, 1]))) + assert [v.id for v in items] == [2, 3, 1] + + def test_sparsevec_cosine_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.cosine_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 2, 3] + + def test_sparsevec_l1_distance(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) + assert [v.id for v in items] == [1, 3, 2] + + def test_filter(self): + create_items() + with Session(engine) as session: + items = session.exec(select(Item).filter(Item.embedding.l2_distance([1, 1, 1]) < 1)) + assert [v.id for v in items] == [1] + + def test_select(self): + with Session(engine) as session: + session.add(Item(embedding=[2, 3, 3])) + items = session.exec(select(Item.embedding.l2_distance([1, 1, 1]))).all() + assert items[0] == 3 + + def test_vector_avg(self): + with Session(engine) as session: + res = session.exec(select(avg(Item.embedding))).first() + assert res is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + res = session.exec(select(avg(Item.embedding))).first() + assert np.array_equal(res, np.array([2.5, 3.5, 4.5])) + + def test_vector_sum(self): + with Session(engine) as session: + res = session.exec(select(sum(Item.embedding))).first() + assert res is None + session.add(Item(embedding=[1, 2, 3])) + session.add(Item(embedding=[4, 5, 6])) + res = session.exec(select(sum(Item.embedding))).first() + assert np.array_equal(res, np.array([5, 7, 9])) + + def test_halfvec_avg(self): + with Session(engine) as session: + res = session.exec(select(avg(Item.half_embedding))).first() + assert res is None + session.add(Item(half_embedding=[1, 2, 3])) + session.add(Item(half_embedding=[4, 5, 6])) + res = session.exec(select(avg(Item.half_embedding))).first() + assert res == HalfVector([2.5, 3.5, 4.5]) + + def test_halfvec_sum(self): + with Session(engine) as session: + res = session.exec(select(sum(Item.half_embedding))).first() + assert res is None + session.add(Item(half_embedding=[1, 2, 3])) + session.add(Item(half_embedding=[4, 5, 6])) + res = session.exec(select(sum(Item.half_embedding))).first() + assert res == HalfVector([5, 7, 9]) + + def test_bad_dimensions(self): + item = Item(embedding=[1, 2]) + with Session(engine) as session: + session.add(item) + with pytest.raises(StatementError, match='expected 3 dimensions, not 2'): + session.commit() diff --git a/tests/test_vector.py b/tests/test_vector.py new file mode 100644 index 0000000..e5a16fe --- /dev/null +++ b/tests/test_vector.py @@ -0,0 +1,59 @@ +import numpy as np +from pgvector import Vector +import pytest +from struct import pack + + +class TestVector: + def test_list(self): + assert Vector([1, 2, 3]).to_list() == [1, 2, 3] + + def test_list_str(self): + with pytest.raises(ValueError, match='could not convert string to float'): + Vector([1, 'two', 3]) + + def test_tuple(self): + assert Vector((1, 2, 3)).to_list() == [1, 2, 3] + + def test_ndarray(self): + arr = np.array([1, 2, 3]) + assert Vector(arr).to_list() == [1, 2, 3] + assert Vector(arr).to_numpy() is not arr + + def test_ndarray_same_object(self): + arr = np.array([1, 2, 3], dtype='>f4') + assert Vector(arr).to_list() == 
[1, 2, 3] + assert Vector(arr).to_numpy() is arr + + def test_ndim_two(self): + with pytest.raises(ValueError) as error: + Vector([[1, 2], [3, 4]]) + assert str(error.value) == 'expected ndim to be 1' + + def test_ndim_zero(self): + with pytest.raises(ValueError) as error: + Vector(1) + assert str(error.value) == 'expected ndim to be 1' + + def test_repr(self): + assert repr(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + assert str(Vector([1, 2, 3])) == 'Vector([1.0, 2.0, 3.0])' + + def test_equality(self): + assert Vector([1, 2, 3]) == Vector([1, 2, 3]) + assert Vector([1, 2, 3]) != Vector([1, 2, 4]) + + def test_dimensions(self): + assert Vector([1, 2, 3]).dimensions() == 3 + + def test_from_text(self): + vec = Vector.from_text('[1.5,2,3]') + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) + + def test_from_binary(self): + data = pack('>HH3f', 3, 0, 1.5, 2, 3) + vec = Vector.from_binary(data) + assert vec.to_list() == [1.5, 2, 3] + assert np.array_equal(vec.to_numpy(), [1.5, 2, 3]) + assert vec.to_binary() == data
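
Note: the `from_binary` round-trips in test_sparse_vector.py and test_vector.py above pin down the big-endian wire layouts these tests assume: `vector` is uint16 dimensions, uint16 unused, then float32 values (`>HH3f`); `sparsevec` is int32 dimensions, int32 nnz, int32 unused, then nnz int32 indices and nnz float32 values (`>iii3i3f`). A minimal decoding sketch for reference; the field names (such as "unused") are descriptive assumptions, only the byte layout comes from the tests:

from struct import pack, unpack_from

def decode_vector(data):
    # uint16 dimensions, uint16 unused, then float32 values
    dim, _unused = unpack_from('>HH', data)
    return list(unpack_from(f'>{dim}f', data, 4))

def decode_sparsevec(data):
    # int32 dimensions, int32 nnz, int32 unused, then indices and values
    dim, nnz, _unused = unpack_from('>iii', data)
    indices = unpack_from(f'>{nnz}i', data, 12)
    values = unpack_from(f'>{nnz}f', data, 12 + 4 * nnz)
    dense = [0.0] * dim
    for i, v in zip(indices, values):
        dense[i] = v
    return dense

# same payloads the tests pack
assert decode_vector(pack('>HH3f', 3, 0, 1.5, 2, 3)) == [1.5, 2.0, 3.0]
assert decode_sparsevec(pack('>iii3i3f', 6, 3, 0, 0, 2, 4, 1.5, 2, 3)) == [1.5, 0.0, 2.0, 0.0, 3.0, 0.0]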
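
Note on the comment in `test_pool` ("use globally=True for apps to ensure registered with all connections"): since `globally` now defaults to `False` for Psycopg 2, pooled applications register once with `globally=True` rather than on every checkout. A minimal sketch of that pattern, assuming the same test database used above:

import numpy as np
from pgvector.psycopg2 import register_vector
from psycopg2.pool import ThreadedConnectionPool

pool = ThreadedConnectionPool(1, 5, dbname='pgvector_python_test')

# register once; globally=True makes the type adapters visible to every
# connection the pool hands out, not just this one
conn = pool.getconn()
try:
    register_vector(conn, globally=True)
finally:
    pool.putconn(conn)

# later checkouts can read vector columns without re-registering
conn = pool.getconn()
try:
    cur = conn.cursor()
    cur.execute("SELECT '[1,2,3]'::vector")
    assert np.array_equal(cur.fetchone()[0], [1, 2, 3])
finally:
    pool.putconn(conn)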