diff --git a/.gitignore b/.gitignore
index 4cc79a6a..65f7fffe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ wheels/
 
 # Virtual Environment
 venv/
+.venv/
 env/
 ENV/
 
diff --git a/migrations/versions/2025_03_03_1008-02b710eda156_add_persona_table.py b/migrations/versions/2025_03_03_1008-02b710eda156_add_persona_table.py
new file mode 100644
index 00000000..e6b90a46
--- /dev/null
+++ b/migrations/versions/2025_03_03_1008-02b710eda156_add_persona_table.py
@@ -0,0 +1,50 @@
+"""add persona table
+
+Revision ID: 02b710eda156
+Revises: 5e5cd2288147
+Create Date: 2025-03-03 10:08:16.206617+00:00
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "02b710eda156"
+down_revision: Union[str, None] = "5e5cd2288147"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    op.execute(
+        """
+        CREATE TABLE IF NOT EXISTS personas (
+            id TEXT PRIMARY KEY,  -- UUID stored as TEXT
+            name TEXT NOT NULL UNIQUE,
+            description TEXT NOT NULL,
+            description_embedding BLOB NOT NULL
+        );
+        """
+    )
+
+    # Finish transaction
+    op.execute("COMMIT;")
+
+
+def downgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    op.execute(
+        """
+        DROP TABLE personas;
+        """
+    )
+
+    # Finish transaction
+    op.execute("COMMIT;")
diff --git a/migrations/versions/2025_03_04_0934-3ec2b4ab569c_migrate_to_glob_pattern.py b/migrations/versions/2025_03_04_0934-3ec2b4ab569c_migrate_to_glob_pattern.py
new file mode 100644
index 00000000..9f090d1c
--- /dev/null
+++ b/migrations/versions/2025_03_04_0934-3ec2b4ab569c_migrate_to_glob_pattern.py
@@ -0,0 +1,50 @@
+"""migrate to glob pattern
+
+Revision ID: 3ec2b4ab569c
+Revises: 02b710eda156
+Create Date: 2025-03-04 09:34:09.966863+00:00
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "3ec2b4ab569c"
+down_revision: Union[str, None] = "02b710eda156"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    # Update the matcher blobs to use glob patterns
+    op.execute(
+        """
+        UPDATE muxes
+        SET matcher_blob = '*' || matcher_blob
+        WHERE matcher_type LIKE '%filename%' AND matcher_blob LIKE '.%'
+        """
+    )
+
+    # Finish transaction
+    op.execute("COMMIT;")
+
+
+def downgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    op.execute(
+        """
+        UPDATE muxes
+        SET matcher_blob = SUBSTR(matcher_blob, 2)
+        WHERE matcher_type LIKE '%filename%' AND matcher_blob LIKE '*%'
+        """
+    )
+
+    # Finish transaction
+    op.execute("COMMIT;")
diff --git a/poetry.lock b/poetry.lock
index 8a96d49b..f757d84e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
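
An aside on the glob-pattern migration above: the upgrade rewrites suffix matchers such as '.py' into glob form ('*.py'), while the downgrade strips the first character from any matcher that starts with '*', including values that were already globs before the upgrade, so the round trip is not lossless for pre-existing globs. A minimal sketch of both statements against a throwaway in-memory SQLite database; the muxes schema and the sample matcher_type values here are illustrative stand-ins, not the real table definition:

import sqlite3

# Stand-in schema: only the two columns the migration touches are modelled.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE muxes (matcher_type TEXT, matcher_blob TEXT)")
conn.executemany(
    "INSERT INTO muxes VALUES (?, ?)",
    [("filename_match", ".py"), ("filename_match", "*.md"), ("catch_all", "")],
)

# upgrade: prefix '*' onto suffix-style matchers ('.py' -> '*.py');
# values already in glob form ('*.md') fail the LIKE '.%' test and are untouched
conn.execute(
    """
    UPDATE muxes
    SET matcher_blob = '*' || matcher_blob
    WHERE matcher_type LIKE '%filename%' AND matcher_blob LIKE '.%'
    """
)
print([row[0] for row in conn.execute("SELECT matcher_blob FROM muxes")])
# ['*.py', '*.md', '']

# downgrade: drop the first character of anything starting with '*',
# which also rewrites the pre-existing glob '*.md' to '.md'
conn.execute(
    """
    UPDATE muxes
    SET matcher_blob = SUBSTR(matcher_blob, 2)
    WHERE matcher_type LIKE '%filename%' AND matcher_blob LIKE '*%'
    """
)
print([row[0] for row in conn.execute("SELECT matcher_blob FROM muxes")])
# ['.py', '.md', '']

The LIKE '.%' guard also makes the upgrade idempotent: after one run no affected value still starts with '.', so a second run cannot produce '**.py'.
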
[[package]] name = "aiohappyeyeballs" @@ -6,6 +6,7 @@ version = "2.4.6" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"}, {file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"}, @@ -17,6 +18,7 @@ version = "3.11.12" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f"}, {file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854"}, @@ -111,7 +113,7 @@ propcache = ">=0.2.0" yarl = ">=1.17.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiosignal" @@ -119,6 +121,7 @@ version = "1.3.2" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, @@ -133,6 +136,7 @@ version = "0.21.0" description = "asyncio bridge to the standard sqlite3 module" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0"}, {file = "aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3"}, @@ -151,6 +155,7 @@ version = "1.14.1" description = "A database migration tool for SQLAlchemy." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "alembic-1.14.1-py3-none-any.whl", hash = "sha256:1acdd7a3a478e208b0503cd73614d5e4c6efafa4e73518bb60e4f2846a37b1c5"}, {file = "alembic-1.14.1.tar.gz", hash = "sha256:496e888245a53adf1498fcab31713a469c65836f8de76e01399aa1c3e90dd213"}, @@ -162,7 +167,7 @@ SQLAlchemy = ">=1.3.0" typing-extensions = ">=4" [package.extras] -tz = ["backports.zoneinfo", "tzdata"] +tz = ["backports.zoneinfo ; python_version < \"3.9\"", "tzdata"] [[package]] name = "annotated-types" @@ -170,6 +175,7 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -181,6 +187,7 @@ version = "4.8.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a"}, {file = "anyio-4.8.0.tar.gz", hash = "sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a"}, @@ -193,7 +200,7 @@ typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} [package.extras] doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1) ; python_version >= \"3.10\"", "uvloop (>=0.21) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\" and python_version < \"3.14\""] trio = ["trio (>=0.26.1)"] [[package]] @@ -202,18 +209,19 @@ version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", 
"coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""] [[package]] name = "azure-core" @@ -221,6 +229,7 @@ version = "1.32.0" description = "Microsoft Azure Core Library for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "azure_core-1.32.0-py3-none-any.whl", hash = "sha256:eac191a0efb23bfa83fddf321b27b122b4ec847befa3091fa736a5c32c50d7b4"}, {file = "azure_core-1.32.0.tar.gz", hash = "sha256:22b3c35d6b2dae14990f6c1be2912bf23ffe50b220e708a28ab1bb92b1c730e5"}, @@ -240,6 +249,7 @@ version = "1.8.3" description = "Security oriented static analyser for python code." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "bandit-1.8.3-py3-none-any.whl", hash = "sha256:28f04dc0d258e1dd0f99dee8eefa13d1cb5e3fde1a5ab0c523971f97b289bcd8"}, {file = "bandit-1.8.3.tar.gz", hash = "sha256:f5847beb654d309422985c36644649924e0ea4425c76dec2e89110b87506193a"}, @@ -255,7 +265,7 @@ stevedore = ">=1.20.0" baseline = ["GitPython (>=3.1.30)"] sarif = ["jschema-to-python (>=1.2.3)", "sarif-om (>=1.0.4)"] test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", "flake8 (>=4.0.0)", "pylint (==1.9.4)", "stestr (>=2.5.0)", "testscenarios (>=0.5.0)", "testtools (>=2.3.0)"] -toml = ["tomli (>=1.1.0)"] +toml = ["tomli (>=1.1.0) ; python_version < \"3.11\""] yaml = ["PyYAML"] [[package]] @@ -264,6 +274,7 @@ version = "25.1.0" description = "The uncompromising code formatter." optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32"}, {file = "black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da"}, @@ -308,6 +319,7 @@ version = "0.7.11" description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "blis-0.7.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cd5fba34c5775e4c440d80e4dea8acb40e2d3855b546e07c4e21fad8f972404c"}, {file = "blis-0.7.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:31273d9086cab9c56986d478e3ed6da6752fa4cdd0f7b5e8e5db30827912d90d"}, @@ -354,6 +366,7 @@ version = "1.2.2.post1" description = "A simple, correct Python build frontend" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5"}, {file = "build-1.2.2.post1.tar.gz", hash = "sha256:b36993e92ca9375a219c99e606a122ff365a760a2d4bba0caa09bd5278b608b7"}, @@ -366,7 +379,7 @@ pyproject_hooks = "*" [package.extras] docs = ["furo (>=2023.08.17)", "sphinx (>=7.0,<8.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)", "sphinx-issues (>=3.0.0)"] -test = ["build[uv,virtualenv]", "filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "setuptools (>=56.0.0)", "setuptools (>=67.8.0)", "wheel (>=0.36.0)"] +test = ["build[uv,virtualenv]", "filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0) ; python_version < \"3.10\"", "setuptools (>=56.0.0) ; python_version == \"3.10\"", "setuptools (>=56.0.0) ; python_version == \"3.11\"", "setuptools (>=67.8.0) ; python_version >= \"3.12\"", "wheel (>=0.36.0)"] typing = ["build[uv]", "importlib-metadata (>=5.1)", "mypy (>=1.9.0,<1.10.0)", "tomli", "typing-extensions (>=3.7.4.3)"] uv = ["uv (>=0.1.18)"] virtualenv = ["virtualenv (>=20.0.35)"] @@ -377,6 +390,7 @@ version = "5.5.2" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, @@ -388,6 +402,7 @@ version = "2.0.10" description = "Super lightweight function registries for your library" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f"}, {file = "catalogue-2.0.10.tar.gz", hash = "sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15"}, @@ -399,6 +414,7 @@ version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main", "dev"] files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, @@ -410,6 +426,8 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "platform_python_implementation != \"PyPy\"" files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -489,6 +507,7 @@ version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -590,6 +609,7 @@ version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -604,6 +624,7 @@ version = "0.20.0" description = "pathlib-style classes for cloud storage services." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cloudpathlib-0.20.0-py3-none-any.whl", hash = "sha256:7af3bcefbf73392ae7f31c08b3660ec31607f8c01b7f6262d4d73469a845f641"}, {file = "cloudpathlib-0.20.0.tar.gz", hash = "sha256:f6ef7ca409a510f7ba4639ba50ab3fc5b6dee82d6dff0d7f5715fd0c9ab35891"}, @@ -621,10 +642,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "sys_platform == \"win32\" or platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or os_name == \"nt\" or sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -632,6 +655,7 @@ version = "15.0.1" description = "Colored terminal output for Python's logging module" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, @@ -649,6 +673,7 @@ version = "0.1.5" description = "The sweetest config system for Python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14"}, {file = "confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e"}, @@ -664,6 +689,7 @@ version = "7.6.12" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "coverage-7.6.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:704c8c8c6ce6569286ae9622e534b4f5b9759b6f2cd643f1c1a61f666d534fe8"}, {file = "coverage-7.6.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ad7525bf0241e5502168ae9c643a2f6c219fa0a283001cee4cf23a9b7da75879"}, @@ -731,7 +757,7 @@ files = [ ] [package.extras] -toml = ["tomli"] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "cryptography" @@ -739,6 +765,7 @@ version = "44.0.2" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" +groups = ["main"] files = [ {file = "cryptography-44.0.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:efcfe97d1b3c79e486554efddeb8f6f53a4cdd4cf6086642784fa31fc384e1d7"}, {file = "cryptography-44.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29ecec49f3ba3f3849362854b7253a9f59799e3763b0c9d0826259a88efa02f1"}, @@ -781,10 +808,10 @@ files = [ cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0)"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] -nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] -pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""] +pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] test = ["certifi (>=2024)", "cryptography-vectors (==44.0.2)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] @@ -796,6 +823,7 @@ version = "2.0.11" description = "Manage calls to calloc/free through Cython" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "cymem-2.0.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b4dd8f8c2475c7c9948eefa89c790d83134600858d8d43b90276efd8df3882e"}, {file = "cymem-2.0.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d46ba0d2e0f749195297d16f2286b55af7d7c084db2b853fdfccece2c000c5dc"}, @@ -841,6 +869,7 @@ version = "5.6.3" description = "Disk Cache -- Disk and file backed persistent cache." optional = false python-versions = ">=3" +groups = ["main", "dev"] files = [ {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, @@ -852,6 +881,7 @@ version = "1.9.0" description = "Distro - an OS platform information API" optional = false python-versions = ">=3.6" +groups = ["main", "dev"] files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, @@ -863,6 +893,7 @@ version = "3.8.0" description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "en_core_web_sm-3.8.0-py3-none-any.whl", hash = "sha256:1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85"}, ] @@ -877,6 +908,7 @@ version = "0.115.11" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "fastapi-0.115.11-py3-none-any.whl", hash = "sha256:32e1541b7b74602e4ef4a0260ecaf3aadf9d4f19590bba3e1bf2ac4666aa2c64"}, {file = "fastapi-0.115.11.tar.gz", hash = "sha256:cc81f03f688678b92600a65a5e618b93592c65005db37157147204d8924bf94f"}, @@ -897,6 +929,7 @@ version = "3.17.0" description = "A platform independent file lock." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, @@ -905,7 +938,7 @@ files = [ [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "flatbuffers" @@ -913,6 +946,7 @@ version = "25.2.10" description = "The FlatBuffers serialization format for Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "flatbuffers-25.2.10-py2.py3-none-any.whl", hash = "sha256:ebba5f4d5ea615af3f7fd70fc310636fbb2bbd1f566ac0a23d98dd412de50051"}, {file = "flatbuffers-25.2.10.tar.gz", hash = "sha256:97e451377a41262f8d9bd4295cc836133415cc03d8cb966410a4af92eb00d26e"}, @@ -924,6 +958,7 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1025,6 +1060,7 @@ version = "2025.2.0" description = "File-system specification" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b"}, {file = "fsspec-2025.2.0.tar.gz", hash = "sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd"}, @@ -1064,6 +1100,7 @@ version = "3.1.1" description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, @@ -1150,6 +1187,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false 
python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1161,6 +1199,7 @@ version = "1.0.7" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"}, {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"}, @@ -1182,6 +1221,7 @@ version = "0.28.1" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, @@ -1194,7 +1234,7 @@ httpcore = "==1.*" idna = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] @@ -1206,6 +1246,7 @@ version = "0.28.1" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" +groups = ["main", "dev"] files = [ {file = "huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7"}, {file = "huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae"}, @@ -1240,6 +1281,7 @@ version = "10.0" description = "Human friendly output for text interfaces using Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, @@ -1254,6 +1296,7 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" +groups = ["main", "dev"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -1268,6 +1311,7 @@ version = "8.6.1" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, @@ -1277,12 +1321,12 @@ files = [ zipp = ">=3.20" [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", 
"jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +test = ["flufl.flake8", "importlib_resources (>=1.3) ; python_version < \"3.9\"", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] [[package]] @@ -1291,6 +1335,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -1302,6 +1347,7 @@ version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, @@ -1319,6 +1365,7 @@ version = "0.8.2" description = "Fast iterable JSON parser." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "jiter-0.8.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ca8577f6a413abe29b079bc30f907894d7eb07a865c4df69475e868d73e71c7b"}, {file = "jiter-0.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b25bd626bde7fb51534190c7e3cb97cee89ee76b76d7585580e22f34f5e3f393"}, @@ -1404,6 +1451,7 @@ version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, @@ -1415,6 +1463,7 @@ version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, @@ -1436,6 +1485,7 @@ version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, @@ -1450,6 +1500,7 @@ version = "3.5.0" description = "Tools for labeling human languages with IETF language tags" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33"}, {file = "langcodes-3.5.0.tar.gz", hash = 
"sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801"}, @@ -1468,6 +1519,7 @@ version = "1.3.0" description = "Supplementary data about languages used by the langcodes module" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf"}, {file = "language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec"}, @@ -1486,6 +1538,7 @@ version = "2.6.2" description = "Fork of the standard library cgi and cgitb modules, being deprecated in PEP-594" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "legacy_cgi-2.6.2-py3-none-any.whl", hash = "sha256:a7b83afb1baf6ebeb56522537c5943ef9813cf933f6715e88a803f7edbce0bff"}, {file = "legacy_cgi-2.6.2.tar.gz", hash = "sha256:9952471ceb304043b104c22d00b4f333cac27a6abe446d8a528fc437cf13c85f"}, @@ -1493,13 +1546,14 @@ files = [ [[package]] name = "litellm" -version = "1.61.20" +version = "1.62.1" description = "Library to easily interface with LLM API providers" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" +groups = ["main", "dev"] files = [ - {file = "litellm-1.61.20-py3-none-any.whl", hash = "sha256:8158f96ceda0d76bb59a59d868686e888e32d66b2380e149c6a7a0746f7a5bc9"}, - {file = "litellm-1.61.20.tar.gz", hash = "sha256:0b0204f56e08c92efd2f9e4bfb850c25eaa95fb03a56aaa21e5e29b2391c9067"}, + {file = "litellm-1.62.1-py3-none-any.whl", hash = "sha256:f576358c72b477207d1f45ce5ac895ede7bd84377f6420a6b522909c829a79dc"}, + {file = "litellm-1.62.1.tar.gz", hash = "sha256:eee9cc40dc9c1da7e411af2f4ef145a67bb61702ae4e1218c1bc15b9e6404daa"}, ] [package.dependencies] @@ -1525,6 +1579,7 @@ version = "0.3.5" description = "Python bindings for the llama.cpp library" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "llama_cpp_python-0.3.5.tar.gz", hash = "sha256:f5ce47499d53d3973e28ca5bdaf2dfe820163fa3fb67e3050f98e2e9b58d2cf6"}, ] @@ -1547,6 +1602,7 @@ version = "1.3.9" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "Mako-1.3.9-py3-none-any.whl", hash = "sha256:95920acccb578427a9aa38e37a186b1e43156c87260d7ba18ca63aa4c7cbd3a1"}, {file = "mako-1.3.9.tar.gz", hash = "sha256:b5d65ff3462870feec922dbccf38f6efb44e5714d7b593a656be86663d8600ac"}, @@ -1566,6 +1622,7 @@ version = "1.2.1" description = "Static memory-efficient and fast Trie-like structures for Python." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "marisa_trie-1.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a2eb41d2f9114d8b7bd66772c237111e00d2bae2260824560eaa0a1e291ce9e8"}, {file = "marisa_trie-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e956e6a46f604b17d570901e66f5214fb6f658c21e5e7665deace236793cef6"}, @@ -1657,6 +1714,7 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -1681,6 +1739,7 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -1751,6 +1810,7 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -1762,6 +1822,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1770,7 +1831,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -1779,6 +1840,7 @@ version = "6.1.0" description = "multidict implementation" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, @@ -1880,6 +1942,7 @@ version = "1.0.12" description = "Cython bindings for MurmurHash" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "murmurhash-1.0.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3f492bbf6f879b6eaf9da4be7471f4b68a3e3ae525aac0f35c2ae27ec91265c"}, {file = "murmurhash-1.0.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3493e0c10a64fa72026af2ea2271d8b3511a438de3c6a771b7a57771611b9c08"}, @@ -1925,6 +1988,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -1936,6 +2000,7 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -1981,6 +2046,7 @@ version = "0.4.7" description = "The official Python client for Ollama." optional = false python-versions = "<4.0,>=3.8" +groups = ["main"] files = [ {file = "ollama-0.4.7-py3-none-any.whl", hash = "sha256:85505663cca67a83707be5fb3aeff0ea72e67846cea5985529d8eca4366564a1"}, {file = "ollama-0.4.7.tar.gz", hash = "sha256:891dcbe54f55397d82d289c459de0ea897e103b86a3f1fad0fdb1895922a75ff"}, @@ -1996,6 +2062,7 @@ version = "1.17.0" description = "Open Neural Network Exchange" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "onnx-1.17.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:38b5df0eb22012198cdcee527cc5f917f09cce1f88a69248aaca22bd78a7f023"}, {file = "onnx-1.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d545335cb49d4d8c47cc803d3a805deb7ad5d9094dc67657d66e568610a36d7d"}, @@ -2038,6 +2105,7 @@ version = "1.20.1" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "onnxruntime-1.20.1-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:e50ba5ff7fed4f7d9253a6baf801ca2883cc08491f9d32d78a80da57256a5439"}, {file = "onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de"}, @@ -2076,6 +2144,7 @@ version = "1.61.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "openai-1.61.1-py3-none-any.whl", hash = "sha256:72b0826240ce26026ac2cd17951691f046e5be82ad122d20a8e1b30ca18bd11e"}, {file = "openai-1.61.1.tar.gz", hash = "sha256:ce1851507218209961f89f3520e06726c0aa7d0512386f0f977e3ac3e4f2472e"}, @@ -2101,6 +2170,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -2112,6 +2182,7 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -2123,6 +2194,7 @@ version = "6.1.1" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["dev"] files = [ {file = "pbr-6.1.1-py2.py3-none-any.whl", hash = "sha256:38d4daea5d9fa63b3f626131b9d34947fd0c8be9b05a29276870580050a25a76"}, {file = "pbr-6.1.1.tar.gz", hash = "sha256:93ea72ce6989eb2eed99d0f75721474f69ad88128afdef5ac377eb797c4bf76b"}, @@ -2137,6 +2209,7 @@ version = "8.13.54" description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "phonenumbers-8.13.54-py2.py3-none-any.whl", hash = "sha256:97624ada7260daafd09538baa6574b14cb9151cf29c5b22d9278abd050957edf"}, {file = "phonenumbers-8.13.54.tar.gz", hash = "sha256:4c32e3c941b24e5ce28d2211f624f0fef08462781e3d7e5e85192275cfd6c680"}, @@ -2148,6 +2221,7 @@ version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, @@ -2164,6 +2238,7 @@ version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -2179,6 +2254,7 @@ version = "3.0.9" description = "Cython hash table that trusts the keys are pre-hashed" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "preshed-3.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f96ef4caf9847b2bb9868574dcbe2496f974e41c2b83d6621c24fb4c3fc57e3"}, {file = "preshed-3.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a61302cf8bd30568631adcdaf9e6b21d40491bd89ba8ebf67324f98b6c2a2c05"}, @@ -2225,6 +2301,7 @@ version = "2.2.357" description = "Presidio Analyzer package" optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ {file = "presidio_analyzer-2.2.357-py3-none-any.whl", hash = "sha256:e7c545dcedb46c497ebd572578804ef7785c0628b85419c25ab947be05430483"}, ] @@ -2238,7 +2315,7 @@ tldextract = "*" [package.extras] azure-ai-language = ["azure-ai-textanalytics", "azure-core"] -gliner = ["gliner (>=0.2.13,<1.0.0)", "huggingface_hub", "onnxruntime-gpu (>=1.19)", "transformers"] +gliner = ["gliner (>=0.2.13,<1.0.0) ; python_version >= \"3.10\"", "huggingface_hub", "onnxruntime-gpu (>=1.19) ; python_version >= \"3.10\"", "transformers"] server = ["flask (>=1.1)", "gunicorn"] stanza = ["spacy_stanza", "stanza"] transformers = ["huggingface_hub", "spacy_huggingface_pipelines", "transformers"] @@ -2249,6 +2326,7 @@ version = "2.2.357" description = "Presidio Anonymizer package - replaces analyzed text with desired values." 
optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ {file = "presidio_anonymizer-2.2.357-py3-none-any.whl", hash = "sha256:0b3e5e0526f5950bb9b27941e5b1b01b6761295d178a8ba4cedd2771aa2aee52"}, ] @@ -2266,6 +2344,7 @@ version = "0.2.1" description = "Accelerated property cache" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "propcache-0.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6b3f39a85d671436ee3d12c017f8fdea38509e4f25b28eb25877293c98c243f6"}, {file = "propcache-0.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d51fbe4285d5db5d92a929e3e21536ea3dd43732c5b177c7ef03f918dff9f2"}, @@ -2357,6 +2436,7 @@ version = "5.29.3" description = "" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"}, {file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"}, @@ -2377,6 +2457,8 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "platform_python_implementation != \"PyPy\"" files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, @@ -2388,6 +2470,7 @@ version = "3.21.0" description = "Cryptographic library for Python" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] files = [ {file = "pycryptodome-3.21.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:dad9bf36eda068e89059d1f07408e397856be9511d7113ea4b586642a429a4fd"}, {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:a1752eca64c60852f38bb29e2c86fca30d7672c024128ef5d70cc15868fa10f4"}, @@ -2429,6 +2512,7 @@ version = "2.10.6" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"}, {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"}, @@ -2441,7 +2525,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -2449,6 +2533,7 @@ version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -2561,6 +2646,7 @@ version = "2.8.1" description = "Settings management using Pydantic" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"}, {file = "pydantic_settings-2.8.1.tar.gz", hash = 
"sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"}, @@ -2581,6 +2667,7 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -2595,6 +2682,7 @@ version = "1.2.0" description = "Wrappers to call pyproject.toml-based build backend hooks." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913"}, {file = "pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8"}, @@ -2606,6 +2694,8 @@ version = "3.5.4" description = "A python implementation of GNU readline." optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, @@ -2620,6 +2710,7 @@ version = "8.3.5" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, @@ -2640,6 +2731,7 @@ version = "0.25.3" description = "Pytest support for asyncio" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3"}, {file = "pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a"}, @@ -2658,6 +2750,7 @@ version = "6.0.0" description = "Pytest plugin for measuring coverage." 
optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "pytest-cov-6.0.0.tar.gz", hash = "sha256:fde0b595ca248bb8e2d76f020b465f3b107c9632e6a1d1705f17834c89dcadc0"}, {file = "pytest_cov-6.0.0-py3-none-any.whl", hash = "sha256:eee6f1b9e61008bd34975a4d5bab25801eb31898b032dd55addc93e96fcaaa35"}, @@ -2676,6 +2769,7 @@ version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, @@ -2690,6 +2784,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2752,6 +2847,7 @@ version = "0.36.2" description = "JSON Referencing + Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, @@ -2768,6 +2864,7 @@ version = "2024.11.6" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, @@ -2871,6 +2968,7 @@ version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2892,6 +2990,7 @@ version = "2.1.0" description = "File transport adapter for Requests" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, @@ -2906,6 +3005,7 @@ version = "13.9.4" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.8.0" +groups = ["dev"] files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, @@ -2924,6 +3024,7 @@ version = "0.22.3" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "rpds_py-0.22.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6c7b99ca52c2c1752b544e310101b98a659b720b21db00e65edca34483259967"}, {file = "rpds_py-0.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be2eb3f2495ba669d2a985f9b426c1797b7d48d6963899276d22f23e33d47e37"}, @@ -3036,6 +3137,7 @@ version = "0.9.9" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.9.9-py3-none-linux_armv6l.whl", hash = "sha256:628abb5ea10345e53dff55b167595a159d3e174d6720bf19761f5e467e68d367"}, {file = "ruff-0.9.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b6cd1428e834b35d7493354723543b28cc11dc14d1ce19b685f6e68e07c05ec7"}, @@ -3063,6 +3165,7 @@ version = "1.6.1" description = "A set of python modules for machine learning and data mining" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"}, {file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"}, @@ -3117,6 +3220,7 @@ version = "1.15.1" description = "Fundamental algorithms for scientific computing in Python" optional = false python-versions = ">=3.10" +groups = ["dev"] files = [ {file = "scipy-1.15.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:c64ded12dcab08afff9e805a67ff4480f5e69993310e093434b10e85dc9d43e1"}, {file = "scipy-1.15.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5b190b935e7db569960b48840e5bef71dc513314cc4e79a1b7d14664f57fd4ff"}, @@ -3166,7 +3270,7 @@ numpy = ">=1.23.5,<2.5" [package.extras] dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.16.5)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] -test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja ; sys_platform != \"emscripten\"", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "setuptools" @@ -3174,19 +3278,20 @@ version = "75.8.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"}, {file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] -core = ["importlib_metadata (>=6)", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.collections", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel 
(>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" @@ -3194,6 +3299,7 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -3205,6 +3311,7 @@ version = "6.4.0" description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" optional = false python-versions = ">=3.6,<4.0" +groups = ["main"] files = [ {file = "smart_open-6.4.0-py3-none-any.whl", hash = "sha256:8d3ef7e6997e8e42dd55c74166ed21e6ac70664caa32dd940b26d54a8f6b4142"}, {file = "smart_open-6.4.0.tar.gz", hash = "sha256:be3c92c246fbe80ebce8fbacb180494a481a77fcdcb7c1aadb2ea5b9c2bee8b9"}, @@ -3226,6 +3333,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -3237,6 +3345,7 @@ version = "3.7.5" description = "Industrial-strength Natural Language Processing (NLP) in Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "spacy-3.7.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8002897701429ee2ab5ff6921ae43560f4cd17184cb1e10dad761901c12dcb85"}, {file = "spacy-3.7.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:43acd19efc845e9126b61a05ed7508a0aff509e96e15563f30f810c19e636b7c"}, @@ -3324,6 +3433,7 @@ version = "3.0.12" description = "Legacy registered functions for spaCy 
backwards compatibility" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, @@ -3335,6 +3445,7 @@ version = "1.0.5" description = "Logging utilities for SpaCy" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, @@ -3346,6 +3457,7 @@ version = "2.0.38" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "SQLAlchemy-2.0.38-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5e1d9e429028ce04f187a9f522818386c8b076723cdbe9345708384f49ebcec6"}, {file = "SQLAlchemy-2.0.38-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b87a90f14c68c925817423b0424381f0e16d80fc9a1a1046ef202ab25b19a444"}, @@ -3441,6 +3553,7 @@ version = "0.0.4" description = "" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "sqlite_vec_sl_tmp-0.0.4-py3-none-macosx_10_6_x86_64.whl", hash = "sha256:5ff08375a51d9d8284b4e14a6a2ccb8faabc5fe8e82953b8a8861302ef2ab147"}, {file = "sqlite_vec_sl_tmp-0.0.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0a8ad2980e95067560670c24afc6a6ba43227387f8c38e833ae8c7d9382080f2"}, @@ -3455,6 +3568,7 @@ version = "2.5.1" description = "Modern high-performance serialization utilities for Python" optional = false python-versions = "<3.14,>=3.9" +groups = ["main"] files = [ {file = "srsly-2.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d0cda6f65cc0dd1daf47e856b0d6c5d51db8a9343c5007723ca06903dcfe367d"}, {file = "srsly-2.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf643e6f45c266cfacea54997a1f9cfe0113fadac1ac21a1ec5b200cfe477ba0"}, @@ -3503,6 +3617,7 @@ version = "0.45.3" description = "The little ASGI library that shines." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"}, {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"}, @@ -3520,6 +3635,7 @@ version = "5.4.0" description = "Manage dynamic plugins for Python applications" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "stevedore-5.4.0-py3-none-any.whl", hash = "sha256:b0be3c4748b3ea7b854b265dcb4caa891015e442416422be16f8b31756107857"}, {file = "stevedore-5.4.0.tar.gz", hash = "sha256:79e92235ecb828fe952b6b8b0c6c87863248631922c8e8e0fa5b17b232c4514d"}, @@ -3534,6 +3650,7 @@ version = "25.1.0" description = "Structured Logging for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "structlog-25.1.0-py3-none-any.whl", hash = "sha256:843fe4f254540329f380812cbe612e1af5ec5b8172205ae634679cd35a6d6321"}, {file = "structlog-25.1.0.tar.gz", hash = "sha256:2ef2a572e0e27f09664965d31a576afe64e46ac6084ef5cec3c2b8cd6e4e3ad3"}, @@ -3551,6 +3668,7 @@ version = "1.13.3" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, @@ -3568,6 +3686,7 @@ version = "8.2.5" description = "A refreshing functional take on deep learning, compatible with your favorite libraries" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "thinc-8.2.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dc267f6aad80a681a85f50383afe91da9e2bec56fefdda86bfa2e4f529bef191"}, {file = "thinc-8.2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d80f1e497971c9fa0938f5cc8fe607bbe87356b405fb7bbc3ff9f32fb4eed3bb"}, @@ -3637,6 +3756,7 @@ version = "3.5.0" description = "threadpoolctl" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, @@ -3648,6 +3768,7 @@ version = "0.8.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b07e33283463089c81ef1467180e3e00ab00d46c2c4bbcef0acab5f771d6695e"}, {file = "tiktoken-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9269348cb650726f44dd3bbb3f9110ac19a8dcc8f54949ad3ef652ca22a38e21"}, @@ -3695,6 +3816,7 @@ version = "5.1.3" description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tldextract-5.1.3-py3-none-any.whl", hash = "sha256:78de310cc2ca018692de5ddf320f9d6bd7c5cf857d0fd4f2175f0cdf4440ea75"}, {file = "tldextract-5.1.3.tar.gz", hash = "sha256:d43c7284c23f5dc8a42fd0fee2abede2ff74cc622674e4cb07f514ab3330c338"}, @@ -3716,6 +3838,7 @@ version = "0.21.0" description = "" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c4c93eae637e7d2aaae3d376f06085164e1660f89304c0ab2b1d08a406636b2"}, {file = "tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f53ea537c925422a2e0e92a24cce96f6bc5046bbef24a1652a5edc8ba975f62e"}, @@ -3748,6 +3871,7 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -3769,6 +3893,7 @@ version = "0.24.0" description = "Python bindings to the Tree-sitter parsing library" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734"}, {file = "tree_sitter-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3f00feff1fc47a8e4863561b8da8f5e023d382dd31ed3e43cd11d4cae445445"}, @@ -3811,6 +3936,7 @@ version = "0.23.4" description = "Go grammar for tree-sitter" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tree_sitter_go-0.23.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c9320f87a05cd47fa0f627b9329bbc09b7ed90de8fe4f5882aed318d6e19962d"}, {file = "tree_sitter_go-0.23.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:914e63d16b36ab0e4f52b031e574b82d17d0bbfecca138ae83e887a1cf5b71ac"}, @@ -3831,6 +3957,7 @@ version = "0.23.5" description = "Java grammar for tree-sitter" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tree_sitter_java-0.23.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:355ce0308672d6f7013ec913dee4a0613666f4cda9044a7824240d17f38209df"}, {file = "tree_sitter_java-0.23.5-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:24acd59c4720dedad80d548fe4237e43ef2b7a4e94c8549b0ca6e4c4d7bf6e69"}, @@ -3851,6 +3978,7 @@ version = "0.23.1" description = "JavaScript grammar for tree-sitter" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tree_sitter_javascript-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6ca583dad4bd79d3053c310b9f7208cd597fd85f9947e4ab2294658bb5c11e35"}, {file = "tree_sitter_javascript-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:94100e491a6a247aa4d14caf61230c171b6376c863039b6d9cd71255c2d815ec"}, @@ -3871,6 +3999,7 @@ version = "0.23.6" description = "Python grammar for tree-sitter" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb"}, {file = "tree_sitter_python-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:680b710051b144fedf61c95197db0094f2245e82551bf7f0c501356333571f7a"}, @@ -3891,6 +4020,7 @@ version = "0.23.2" description = "Rust grammar for tree-sitter" optional = false 
python-versions = ">=3.9" +groups = ["main"] files = [ {file = "tree_sitter_rust-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b6b26a4c07ddc243f3701450ff34093b8e3b08f14d269db2d049c625d151677c"}, {file = "tree_sitter_rust-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c6224f608df559d75425e5ef428f635b9fb87d7aa8716444915ee67ec6955085"}, @@ -3911,6 +4041,7 @@ version = "0.9.4" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "typer-0.9.4-py3-none-any.whl", hash = "sha256:aa6c4a4e2329d868b80ecbaf16f807f2b54e192209d7ac9dd42691d63f7a54eb"}, {file = "typer-0.9.4.tar.gz", hash = "sha256:f714c2d90afae3a7929fcd72a3abb08df305e1ff61719381384211c4070af57f"}, @@ -3932,6 +4063,7 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -3943,13 +4075,14 @@ version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -3960,6 +4093,7 @@ version = "0.34.0" description = "The lightning-fast ASGI server." 
optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "uvicorn-0.34.0-py3-none-any.whl", hash = "sha256:023dc038422502fa28a09c7a30bf2b6991512da7dcdb8fd35fe57cfc154126f4"}, {file = "uvicorn-0.34.0.tar.gz", hash = "sha256:404051050cd7e905de2c9a7e61790943440b3416f49cb409f965d9dcd0fa73e9"}, @@ -3970,7 +4104,7 @@ click = ">=7.0" h11 = ">=0.8" [package.extras] -standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +standard = ["colorama (>=0.4) ; sys_platform == \"win32\"", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1) ; sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"", "watchfiles (>=0.13)", "websockets (>=10.4)"] [[package]] name = "wasabi" @@ -3978,6 +4112,7 @@ version = "1.1.3" description = "A lightweight console printing and formatting toolkit" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c"}, {file = "wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878"}, @@ -3992,6 +4127,7 @@ version = "0.4.1" description = "Weasel: A small and easy workflow system" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c"}, {file = "weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9"}, @@ -4014,6 +4150,7 @@ version = "0.45.1" description = "A built-package format for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248"}, {file = "wheel-0.45.1.tar.gz", hash = "sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729"}, @@ -4028,6 +4165,7 @@ version = "1.18.3" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34"}, {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7"}, @@ -4124,20 +4262,21 @@ version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +test = 
["big-O", "importlib-resources ; python_version < \"3.9\"", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.12,<3.13" -content-hash = "9da14898a535ae979076da7fff7d0b8b9cfaab169d7396784baf1539df45038d" +content-hash = "1cae360ec3078b2da000dfea1d112e32256502aa9b3e2a4d8ca919384a49aff6" diff --git a/pyproject.toml b/pyproject.toml index 28e0cc1c..a6748387 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ PyYAML = "==6.0.2" fastapi = "==0.115.11" uvicorn = "==0.34.0" structlog = "==25.1.0" -litellm = "==1.61.20" +litellm = "==1.62.1" llama_cpp_python = "==0.3.5" cryptography = "==44.0.2" sqlalchemy = "==2.0.38" @@ -50,7 +50,7 @@ ruff = "==0.9.9" bandit = "==1.8.3" build = "==1.2.2.post1" wheel = "==0.45.1" -litellm = "==1.61.20" +litellm = "==1.62.1" pytest-asyncio = "==0.25.3" llama_cpp_python = "==0.3.5" scikit-learn = "==1.6.1" diff --git a/src/codegate/cli.py b/src/codegate/cli.py index be5096f6..455d9001 100644 --- a/src/codegate/cli.py +++ b/src/codegate/cli.py @@ -16,7 +16,7 @@ from codegate.config import Config, ConfigurationError from codegate.db.connection import init_db_sync, init_session_if_not_exists from codegate.pipeline.factory import PipelineFactory -from codegate.pipeline.secrets.manager import SecretsManager +from codegate.pipeline.sensitive_data.manager import SensitiveDataManager from codegate.providers import crud as provendcrud from codegate.providers.copilot.provider import CopilotProvider from codegate.server import init_app @@ -331,8 +331,8 @@ def serve( # noqa: C901 click.echo("Existing Certificates are already present.") # Initialize secrets manager and pipeline factory - secrets_manager = SecretsManager() - pipeline_factory = PipelineFactory(secrets_manager) + sensitive_data_manager = SensitiveDataManager() + pipeline_factory = PipelineFactory(sensitive_data_manager) app = init_app(pipeline_factory) diff --git a/src/codegate/config.py b/src/codegate/config.py index 11cd96bf..761ca09e 100644 --- a/src/codegate/config.py +++ b/src/codegate/config.py @@ -57,6 +57,9 @@ class Config: force_certs: bool = False max_fim_hash_lifetime: int = 60 * 5 # Time in seconds. Default is 5 minutes. + # Min value is 0 (max similarity), max value is 2 (orthogonal) + # The value 0.75 was found through experimentation. 
See /tests/muxing/test_semantic_router.py + persona_threshold: float = 0.75 # Provider URLs with defaults provider_urls: Dict[str, str] = field(default_factory=lambda: DEFAULT_PROVIDER_URLS.copy()) diff --git a/src/codegate/db/connection.py b/src/codegate/db/connection.py index 2d56fccd..803943b3 100644 --- a/src/codegate/db/connection.py +++ b/src/codegate/db/connection.py @@ -1,9 +1,12 @@ import asyncio import json +import sqlite3 import uuid from pathlib import Path from typing import Dict, List, Optional, Type +import numpy as np +import sqlite_vec_sl_tmp import structlog from alembic import command as alembic_command from alembic.config import Config as AlembicConfig @@ -22,6 +25,9 @@ IntermediatePromptWithOutputUsageAlerts, MuxRule, Output, + Persona, + PersonaDistance, + PersonaEmbedding, Prompt, ProviderAuthMaterial, ProviderEndpoint, @@ -65,7 +71,7 @@ def __new__(cls, *args, **kwargs): # It should only be used for testing if "_no_singleton" in kwargs and kwargs["_no_singleton"]: kwargs.pop("_no_singleton") - return super().__new__(cls, *args, **kwargs) + return super().__new__(cls) if cls._instance is None: cls._instance = super().__new__(cls) @@ -92,6 +98,22 @@ def __init__(self, sqlite_path: Optional[str] = None, **kwargs): } self._async_db_engine = create_async_engine(**engine_dict) + def _get_vec_db_connection(self): + """ + Vector database connection is a separate connection to the SQLite database. aiosqlite + does not support loading extensions, so we need to use the sqlite3 module to load the + vector extension. + """ + try: + conn = sqlite3.connect(self._db_path) + conn.enable_load_extension(True) + sqlite_vec_sl_tmp.load(conn) + conn.enable_load_extension(False) + return conn + except Exception: + logger.exception("Failed to initialize vector database connection") + raise + def does_db_exist(self): return self._db_path.is_file() @@ -523,6 +545,30 @@ async def add_mux(self, mux: MuxRule) -> MuxRule: added_mux = await self._execute_update_pydantic_model(mux, sql, should_raise=True) return added_mux + async def add_persona(self, persona: PersonaEmbedding) -> None: + """Add a new Persona to the DB. + + This handles validation and insertion of a new persona. + + It may raise an AlreadyExistsError if the persona already exists. + """ + sql = text( + """ + INSERT INTO personas (id, name, description, description_embedding) + VALUES (:id, :name, :description, :description_embedding) + """ + ) + + try: + # For Pydantic we convert the numpy array to string when serializing with .model_dump() + # We need to convert it back to a numpy array before inserting it into the DB. + persona_dict = persona.model_dump() + persona_dict["description_embedding"] = persona.description_embedding + await self._execute_with_no_return(sql, persona_dict) + except IntegrityError as e: + logger.debug(f"Exception type: {type(e)}") + raise AlreadyExistsError(f"Persona '{persona.name}' already exists.") + class DbReader(DbCodeGate): def __init__(self, sqlite_path: Optional[str] = None, *args, **kwargs): @@ -569,6 +615,20 @@ async def _exec_select_conditions_to_pydantic( raise e return None + async def _exec_vec_db_query_to_pydantic( + self, sql_command: str, conditions: dict, model_type: Type[BaseModel] + ) -> List[BaseModel]: + """ + Execute a query on the vector database. This is a separate connection to the SQLite + database that has the vector extension loaded.
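Since the vector queries run on that separate extension-loaded connection, the pattern can be exercised standalone. A minimal sketch, assuming a local database file; the path and the 384-dimensional embedding are illustrative placeholders, not values from this change:

```python
import sqlite3

import numpy as np
import sqlite_vec_sl_tmp

# Same extension-loading dance as _get_vec_db_connection.
conn = sqlite3.connect("codegate.db")
conn.enable_load_extension(True)
sqlite_vec_sl_tmp.load(conn)
conn.enable_load_extension(False)

query_embedding = np.random.rand(384).astype(np.float32)  # stand-in embedding
# sqlite-vec accepts float32 BLOBs, which .tobytes() produces.
rows = conn.execute(
    "SELECT name, vec_distance_cosine(description_embedding, ?) AS distance FROM personas",
    (query_embedding.tobytes(),),
).fetchall()
conn.close()
```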
+ """ + conn = self._get_vec_db_connection() + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + results = [model_type(**row) for row in cursor.execute(sql_command, conditions)] + conn.close() + return results + async def get_prompts_with_output(self, workpace_id: str) -> List[GetPromptWithOutputsRow]: sql = text( """ @@ -893,6 +953,45 @@ async def get_muxes_by_workspace(self, workspace_id: str) -> List[MuxRule]: ) return muxes + async def get_persona_by_name(self, persona_name: str) -> Optional[Persona]: + """ + Get a persona by name. + """ + sql = text( + """ + SELECT + id, name, description + FROM personas + WHERE name = :name + """ + ) + conditions = {"name": persona_name} + personas = await self._exec_select_conditions_to_pydantic( + Persona, sql, conditions, should_raise=True + ) + return personas[0] if personas else None + + async def get_distance_to_persona( + self, persona_id: str, query_embedding: np.ndarray + ) -> PersonaDistance: + """ + Get the distance between a persona and a query embedding. + """ + sql = """ + SELECT + id, + name, + description, + vec_distance_cosine(description_embedding, :query_embedding) as distance + FROM personas + WHERE id = :id + """ + conditions = {"id": persona_id, "query_embedding": query_embedding} + persona_distance = await self._exec_vec_db_query_to_pydantic( + sql, conditions, PersonaDistance + ) + return persona_distance[0] + def init_db_sync(db_path: Optional[str] = None): """DB will be initialized in the constructor in case it doesn't exist.""" diff --git a/src/codegate/db/models.py b/src/codegate/db/models.py index 8f2365a0..a5941e96 100644 --- a/src/codegate/db/models.py +++ b/src/codegate/db/models.py @@ -2,7 +2,8 @@ from enum import Enum from typing import Annotated, Any, Dict, List, Optional -from pydantic import BaseModel, StringConstraints +import numpy as np +from pydantic import BaseModel, BeforeValidator, ConfigDict, PlainSerializer, StringConstraints class AlertSeverity(str, Enum): @@ -240,3 +241,58 @@ class MuxRule(BaseModel): priority: int created_at: Optional[datetime.datetime] = None updated_at: Optional[datetime.datetime] = None + + +def nd_array_custom_before_validator(x): + # custome before validation logic + return x + + +def nd_array_custom_serializer(x): + # custome serialization logic + return str(x) + + +# Pydantic doesn't support numpy arrays out of the box hence we need to construct a custom type. +# There are 2 things necessary for a Pydantic custom type: Validator and Serializer +# The lines below build our custom type +# Docs: https://docs.pydantic.dev/latest/concepts/types/#adding-validation-and-serialization +# Open Pydantic issue for npy support: https://github.com/pydantic/pydantic/issues/7017 +NdArray = Annotated[ + np.ndarray, + BeforeValidator(nd_array_custom_before_validator), + PlainSerializer(nd_array_custom_serializer, return_type=str), +] + + +class Persona(BaseModel): + """ + Represents a persona object. + """ + + id: str + name: str + description: str + + +class PersonaEmbedding(Persona): + """ + Represents a persona object with an embedding. + """ + + description_embedding: NdArray + + # Part of the workaround to allow numpy arrays in pydantic models + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class PersonaDistance(Persona): + """ + Result of an SQL query to get the distance between the query and the persona description. + + A vector similarity search is performed to get the distance. Distance values ranges [0, 2]. 
+ 0 means the vectors are identical, 2 means they are opposite. + See [sqlite docs](https://alexgarcia.xyz/sqlite-vec/api-reference.html#vec_distance_cosine) + """ + + distance: float diff --git a/src/codegate/muxing/rulematcher.py b/src/codegate/muxing/rulematcher.py index 247e6c12..d41eb2ce 100644 --- a/src/codegate/muxing/rulematcher.py +++ b/src/codegate/muxing/rulematcher.py @@ -1,4 +1,5 @@ import copy +import fnmatch from abc import ABC, abstractmethod from asyncio import Lock from typing import Dict, List, Optional @@ -116,16 +117,16 @@ def _extract_request_filenames(self, detected_client: ClientType, data: dict) -> def _is_matcher_in_filenames(self, detected_client: ClientType, data: dict) -> bool: """ Check if the matcher is in the request filenames. + The matcher is treated as a glob pattern and matched against the filenames. """ # Empty matcher_blob means we match everything if not self._mux_rule.matcher: return True filenames_to_match = self._extract_request_filenames(detected_client, data) - # _mux_rule.matcher can be a filename or a file extension. We match if any of the filenames - # match the rule. + # _mux_rule.matcher is a glob pattern. We match if any of the filenames + # match the pattern. is_filename_match = any( - self._mux_rule.matcher == filename or filename.endswith(self._mux_rule.matcher) - for filename in filenames_to_match + fnmatch.fnmatch(filename, self._mux_rule.matcher) for filename in filenames_to_match ) return is_filename_match diff --git a/src/codegate/muxing/semantic_router.py b/src/codegate/muxing/semantic_router.py new file mode 100644 index 00000000..ce240b1f --- /dev/null +++ b/src/codegate/muxing/semantic_router.py @@ -0,0 +1,140 @@ +import unicodedata +import uuid + +import numpy as np +import regex as re +import structlog + +from codegate.config import Config +from codegate.db import models as db_models +from codegate.db.connection import DbReader, DbRecorder +from codegate.inference.inference_engine import LlamaCppInferenceEngine + +logger = structlog.get_logger("codegate") + + +REMOVE_URLS = re.compile(r"https?://\S+|www\.\S+") +REMOVE_EMAILS = re.compile(r"\S+@\S+") +REMOVE_CODE_BLOCKS = re.compile(r"```[\s\S]*?```") +REMOVE_INLINE_CODE = re.compile(r"`[^`]*`") +REMOVE_HTML_TAGS = re.compile(r"<[^>]+>") +REMOVE_PUNCTUATION = re.compile(r"[^\w\s\']") +NORMALIZE_WHITESPACE = re.compile(r"\s+") +NORMALIZE_DECIMAL_NUMBERS = re.compile(r"\b\d+\.\d+\b") +NORMALIZE_INTEGER_NUMBERS = re.compile(r"\b\d+\b") + + +class PersonaDoesNotExistError(Exception): + pass + + +class SemanticRouter: + + def __init__(self): + self._inference_engine = LlamaCppInferenceEngine() + conf = Config.get_config() + self._embeddings_model = f"{conf.model_base_path}/{conf.embedding_model}" + self._n_gpu = conf.chat_model_n_gpu_layers + self._persona_threshold = conf.persona_threshold + self._db_recorder = DbRecorder() + self._db_reader = DbReader() + + def _clean_text_for_embedding(self, text: str) -> str: + """ + Clean the text for embedding. This function should be used to preprocess the text + before embedding. + + Performs the following operations: + 1. Replaces newlines and carriage returns with spaces + 2. Removes extra whitespace + 3. Converts to lowercase + 4. Removes URLs and email addresses + 5. Removes code block markers and other markdown syntax + 6. Normalizes Unicode characters + 7. Handles special characters and punctuation + 8.
Normalizes numbers + """ + if not text: + return "" + + # Replace newlines and carriage returns with spaces + text = text.replace("\n", " ").replace("\r", " ") + + # Normalize Unicode characters (e.g., convert accented characters to ASCII equivalents) + text = unicodedata.normalize("NFKD", text) + text = "".join([c for c in text if not unicodedata.combining(c)]) + + # Remove URLs + text = REMOVE_URLS.sub(" ", text) + + # Remove email addresses + text = REMOVE_EMAILS.sub(" ", text) + + # Remove code block markers and other markdown/code syntax + text = REMOVE_CODE_BLOCKS.sub(" ", text) + text = REMOVE_INLINE_CODE.sub(" ", text) + + # Remove HTML/XML tags + text = REMOVE_HTML_TAGS.sub(" ", text) + + # Normalize numbers (replace with placeholder) + text = NORMALIZE_DECIMAL_NUMBERS.sub(" NUM ", text) # Decimal numbers + text = NORMALIZE_INTEGER_NUMBERS.sub(" NUM ", text) # Integer numbers + + # Replace punctuation with spaces (keeping apostrophes for contractions) + text = REMOVE_PUNCTUATION.sub(" ", text) + + # Normalize whitespace (replace multiple spaces with a single space) + text = NORMALIZE_WHITESPACE.sub(" ", text) + + # Convert to lowercase and strip + text = text.lower().strip() + + return text + + async def _embed_text(self, text: str) -> np.ndarray: + """ + Helper function to embed text using the inference engine. + """ + cleaned_text = self._clean_text_for_embedding(text) + # .embed returns a list of embeddings + embed_list = await self._inference_engine.embed( + self._embeddings_model, [cleaned_text], n_gpu_layers=self._n_gpu + ) + # Use only the first entry in the list and make sure we have the appropriate type + logger.debug("Text embedded in semantic routing", text=cleaned_text[:50]) + return np.array(embed_list[0], dtype=np.float32) + + async def add_persona(self, persona_name: str, persona_desc: str) -> None: + """ + Add a new persona to the database. The persona description is embedded + and stored in the database. + """ + emb_persona_desc = await self._embed_text(persona_desc) + new_persona = db_models.PersonaEmbedding( + id=str(uuid.uuid4()), + name=persona_name, + description=persona_desc, + description_embedding=emb_persona_desc, + ) + await self._db_recorder.add_persona(new_persona) + logger.info(f"Added persona {persona_name} to the database.") + + async def check_persona_match(self, persona_name: str, query: str) -> bool: + """ + Check if the query matches the persona description. A vector similarity + search is performed between the query and the persona description. + 0 means the vectors are identical, 2 means they are opposite.
+ See + [sqlite docs](https://alexgarcia.xyz/sqlite-vec/api-reference.html#vec_distance_cosine) + """ + persona = await self._db_reader.get_persona_by_name(persona_name) + if not persona: + raise PersonaDoesNotExistError(f"Persona {persona_name} does not exist.") + + emb_query = await self._embed_text(query) + persona_distance = await self._db_reader.get_distance_to_persona(persona.id, emb_query) + logger.info(f"Persona distance to {persona_name}", distance=persona_distance.distance) + if persona_distance.distance < self._persona_threshold: + return True + return False diff --git a/src/codegate/pipeline/base.py b/src/codegate/pipeline/base.py index 0baa322a..ddcd5a61 100644 --- a/src/codegate/pipeline/base.py +++ b/src/codegate/pipeline/base.py @@ -12,34 +12,23 @@ from codegate.clients.clients import ClientType from codegate.db.models import Alert, AlertSeverity, Output, Prompt from codegate.extract_snippets.message_extractor import CodeSnippet -from codegate.pipeline.secrets.manager import SecretsManager +from codegate.pipeline.sensitive_data.manager import SensitiveDataManager logger = structlog.get_logger("codegate") @dataclass class PipelineSensitiveData: - manager: SecretsManager + manager: SensitiveDataManager session_id: str - api_key: Optional[str] = None model: Optional[str] = None - provider: Optional[str] = None - api_base: Optional[str] = None def secure_cleanup(self): """Securely cleanup sensitive data for this session""" if self.manager is None or self.session_id == "": return - self.manager.cleanup_session(self.session_id) self.session_id = "" - - # Securely wipe the API key using the same method as secrets manager - if self.api_key is not None: - api_key_bytes = bytearray(self.api_key.encode()) - self.manager.crypto.wipe_bytearray(api_key_bytes) - self.api_key = None - self.model = None @@ -274,19 +263,19 @@ class InputPipelineInstance: def __init__( self, pipeline_steps: List[PipelineStep], - secret_manager: SecretsManager, + sensitive_data_manager: SensitiveDataManager, is_fim: bool, client: ClientType = ClientType.GENERIC, ): self.pipeline_steps = pipeline_steps - self.secret_manager = secret_manager + self.sensitive_data_manager = sensitive_data_manager self.is_fim = is_fim self.context = PipelineContext(client=client) # we create the sensitive context here so that it is not shared between individual requests # TODO: could we get away with just generating the session ID for an instance?
self.context.sensitive = PipelineSensitiveData( - manager=self.secret_manager, + manager=self.sensitive_data_manager, session_id=str(uuid.uuid4()), ) self.context.metadata["is_fim"] = is_fim @@ -343,12 +332,12 @@ class SequentialPipelineProcessor: def __init__( self, pipeline_steps: List[PipelineStep], - secret_manager: SecretsManager, + sensitive_data_manager: SensitiveDataManager, client_type: ClientType, is_fim: bool, ): self.pipeline_steps = pipeline_steps - self.secret_manager = secret_manager + self.sensitive_data_manager = sensitive_data_manager self.is_fim = is_fim self.instance = self._create_instance(client_type) @@ -356,7 +345,7 @@ def _create_instance(self, client_type: ClientType) -> InputPipelineInstance: """Create a new pipeline instance for processing a request""" return InputPipelineInstance( self.pipeline_steps, - self.secret_manager, + self.sensitive_data_manager, self.is_fim, client_type, ) diff --git a/src/codegate/pipeline/comment/output.py b/src/codegate/pipeline/comment/output.py index 4583a659..3a17b551 100644 --- a/src/codegate/pipeline/comment/output.py +++ b/src/codegate/pipeline/comment/output.py @@ -12,7 +12,6 @@ ) from codegate.pipeline.base import PipelineContext from codegate.pipeline.output import OutputPipelineContext, OutputPipelineStep -from codegate.pipeline.suspicious_commands.suspicious_commands import check_suspicious_code from codegate.storage import StorageEngine from codegate.utils.package_extractor import PackageExtractor @@ -52,15 +51,16 @@ async def _snippet_comment(self, snippet: CodeSnippet, context: PipelineContext) """Create a comment for a snippet""" comment = "" - if ( - snippet.filepath is None - and snippet.file_extension is None - and "filepath" not in snippet.code - and "existing code" not in snippet.code - ): - new_comment, is_suspicious = await check_suspicious_code(snippet.code, snippet.language) - if is_suspicious: - comment += new_comment + # if ( + # snippet.filepath is None + # and snippet.file_extension is None + # and "filepath" not in snippet.code + # and "existing code" not in snippet.code + # ): + # new_comment, is_suspicious = await check_suspicious_code(snippet.code, + # snippet.language) + # if is_suspicious: + # comment += new_comment snippet.libraries = PackageExtractor.extract_packages(snippet.code, snippet.language) diff --git a/src/codegate/pipeline/factory.py b/src/codegate/pipeline/factory.py index acde51b4..813459d5 100644 --- a/src/codegate/pipeline/factory.py +++ b/src/codegate/pipeline/factory.py @@ -12,18 +12,18 @@ PiiRedactionNotifier, PiiUnRedactionStep, ) -from codegate.pipeline.secrets.manager import SecretsManager from codegate.pipeline.secrets.secrets import ( CodegateSecrets, SecretRedactionNotifier, SecretUnredactionStep, ) +from codegate.pipeline.sensitive_data.manager import SensitiveDataManager from codegate.pipeline.system_prompt.codegate import SystemPrompt class PipelineFactory: - def __init__(self, secrets_manager: SecretsManager): - self.secrets_manager = secrets_manager + def __init__(self, sensitive_data_manager: SensitiveDataManager): + self.sensitive_data_manager = sensitive_data_manager def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor: input_steps: List[PipelineStep] = [ @@ -32,7 +32,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr # and without obfuscating the secrets, we'd leak the secrets during those # later steps CodegateSecrets(), - CodegatePii(), + CodegatePii(self.sensitive_data_manager), CodegateCli(), 
CodegateContextRetriever(), SystemPrompt( @@ -41,7 +41,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr ] return SequentialPipelineProcessor( input_steps, - self.secrets_manager, + self.sensitive_data_manager, client_type, is_fim=False, ) @@ -49,11 +49,11 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr def create_fim_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor: fim_steps: List[PipelineStep] = [ CodegateSecrets(), - CodegatePii(), + CodegatePii(self.sensitive_data_manager), ] return SequentialPipelineProcessor( fim_steps, - self.secrets_manager, + self.sensitive_data_manager, client_type, is_fim=True, ) diff --git a/src/codegate/pipeline/pii/analyzer.py b/src/codegate/pipeline/pii/analyzer.py index a1ed5bed..96442824 100644 --- a/src/codegate/pipeline/pii/analyzer.py +++ b/src/codegate/pipeline/pii/analyzer.py @@ -1,5 +1,4 @@ -import uuid -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional import structlog from presidio_analyzer import AnalyzerEngine @@ -7,41 +6,11 @@ from codegate.db.models import AlertSeverity from codegate.pipeline.base import PipelineContext +from codegate.pipeline.sensitive_data.session_store import SessionStore logger = structlog.get_logger("codegate.pii.analyzer") -class PiiSessionStore: - """ - A class to manage PII (Personally Identifiable Information) session storage. - - Attributes: - session_id (str): The unique identifier for the session. If not provided, a new UUID - is generated. mappings (Dict[str, str]): A dictionary to store mappings between UUID - placeholders and PII. - - Methods: - add_mapping(pii: str) -> str: - Adds a PII string to the session store and returns a UUID placeholder for it. - - get_pii(uuid_placeholder: str) -> str: - Retrieves the PII string associated with the given UUID placeholder. If the placeholder - is not found, returns the placeholder itself. - """ - - def __init__(self, session_id: str = None): - self.session_id = session_id or str(uuid.uuid4()) - self.mappings: Dict[str, str] = {} - - def add_mapping(self, pii: str) -> str: - uuid_placeholder = f"<{str(uuid.uuid4())}>" - self.mappings[uuid_placeholder] = pii - return uuid_placeholder - - def get_pii(self, uuid_placeholder: str) -> str: - return self.mappings.get(uuid_placeholder, uuid_placeholder) - - class PiiAnalyzer: """ PiiAnalyzer class for analyzing and anonymizing text containing PII. @@ -52,12 +21,12 @@ class PiiAnalyzer: Get or create the singleton instance of PiiAnalyzer. analyze: text (str): The text to analyze for PII. - Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text, a list of + Tuple[str, List[Dict[str, Any]], SessionStore]: The anonymized text, a list of found PII details, and the session store. entities (List[str]): The PII entities to analyze for. restore_pii: anonymized_text (str): The text with anonymized PII. - session_store (PiiSessionStore): The PiiSessionStore used for anonymization. + session_store (SessionStore): The SessionStore used for anonymization. str: The text with original PII restored. 
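With the anonymization logic moved out of the analyzer, `analyze` now returns Presidio's raw results. A sketch of what a caller receives, assuming presidio-analyzer's stock `AnalyzerEngine` and an installed spaCy model (the sample text is illustrative):

```python
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="Call me at 212-555-0123",
    entities=["PHONE_NUMBER"],
    language="en",
    score_threshold=0.3,
)
for r in results:
    # Each RecognizerResult carries the span and confidence that the new
    # process_results() in pii.py consumes.
    print(r.entity_type, r.start, r.end, round(r.score, 2))
```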
""" @@ -95,13 +64,11 @@ def __init__(self): # Create analyzer with custom NLP engine self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine) self.anonymizer = AnonymizerEngine() - self.session_store = PiiSessionStore() + self.session_store = SessionStore() PiiAnalyzer._instance = self - def analyze( - self, text: str, context: Optional[PipelineContext] = None - ) -> Tuple[str, List[Dict[str, Any]], PiiSessionStore]: + def analyze(self, text: str, context: Optional[PipelineContext] = None) -> List: # Prioritize credit card detection first entities = [ "PHONE_NUMBER", @@ -125,81 +92,30 @@ def analyze( language="en", score_threshold=0.3, # Lower threshold to catch more potential matches ) + return analyzer_results - # Track found PII - found_pii = [] - - # Only anonymize if PII was found - if analyzer_results: - # Log each found PII instance and anonymize - anonymized_text = text - for result in analyzer_results: - pii_value = text[result.start : result.end] - uuid_placeholder = self.session_store.add_mapping(pii_value) - pii_info = { - "type": result.entity_type, - "value": pii_value, - "score": result.score, - "start": result.start, - "end": result.end, - "uuid_placeholder": uuid_placeholder, - } - found_pii.append(pii_info) - anonymized_text = anonymized_text.replace(pii_value, uuid_placeholder) - - # Log each PII detection with its UUID mapping - logger.info( - "PII detected and mapped", - pii_type=result.entity_type, - score=f"{result.score:.2f}", - uuid=uuid_placeholder, - # Don't log the actual PII value for security - value_length=len(pii_value), - session_id=self.session_store.session_id, - ) - - # Log summary of all PII found in this analysis - if found_pii and context: - # Create notification string for alert - notify_string = ( - f"**PII Detected** 🔒\n" - f"- Total PII Found: {len(found_pii)}\n" - f"- Types Found: {', '.join(set(p['type'] for p in found_pii))}\n" - ) - context.add_alert( - self._name, - trigger_string=notify_string, - severity_category=AlertSeverity.CRITICAL, - ) - - logger.info( - "PII analysis complete", - total_pii_found=len(found_pii), - pii_types=[p["type"] for p in found_pii], - session_id=self.session_store.session_id, - ) - - # Return the anonymized text, PII details, and session store - return anonymized_text, found_pii, self.session_store - - # If no PII found, return original text, empty list, and session store - return text, [], self.session_store - - def restore_pii(self, anonymized_text: str, session_store: PiiSessionStore) -> str: + def restore_pii(self, session_id: str, anonymized_text: str) -> str: """ Restore the original PII (Personally Identifiable Information) in the given anonymized text. This method replaces placeholders in the anonymized text with their corresponding original - PII values using the mappings stored in the provided PiiSessionStore. + PII values using the mappings stored in the provided SessionStore. Args: anonymized_text (str): The text containing placeholders for PII. - session_store (PiiSessionStore): The session store containing mappings of placeholders + session_id (str): The session id containing mappings of placeholders to original PII. Returns: str: The text with the original PII restored. """ - for uuid_placeholder, original_pii in session_store.mappings.items(): + session_data = self.session_store.get_by_session_id(session_id) + if not session_data: + logger.warning( + "No active PII session found for given session ID. Unable to restore PII." 
+ ) + return anonymized_text + + for uuid_placeholder, original_pii in session_data.items(): anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii) return anonymized_text diff --git a/src/codegate/pipeline/pii/manager.py b/src/codegate/pipeline/pii/manager.py deleted file mode 100644 index 54112713..00000000 --- a/src/codegate/pipeline/pii/manager.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import Any, Dict, List, Optional, Tuple - -import structlog - -from codegate.pipeline.base import PipelineContext -from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore - -logger = structlog.get_logger("codegate") - - -class PiiManager: - """ - Manages the analysis and restoration of Personally Identifiable Information - (PII) in text. - - Attributes: - analyzer (PiiAnalyzer): The singleton instance of PiiAnalyzer used for - PII detection and restoration. - session_store (PiiSessionStore): The session store for the current PII session. - - Methods: - __init__(): - Initializes the PiiManager with the singleton PiiAnalyzer instance and sets the - session store. - - analyze(text: str) -> Tuple[str, List[Dict[str, Any]]]: - Analyzes the given text for PII, anonymizes it, and logs the detected PII details. - Args: - text (str): The text to be analyzed for PII. - Returns: - Tuple[str, List[Dict[str, Any]]]: A tuple containing the anonymized text and - a list of found PII details. - - restore_pii(anonymized_text: str) -> str: - Restores the PII in the given anonymized text using the current session. - Args: - anonymized_text (str): The text with anonymized PII to be restored. - Returns: - str: The text with restored PII. - """ - - def __init__(self): - """ - Initialize the PiiManager with the singleton PiiAnalyzer instance. - """ - self.analyzer = PiiAnalyzer.get_instance() - # Always use the analyzer's session store - self._session_store = self.analyzer.session_store - - @property - def session_store(self) -> PiiSessionStore: - """Get the current session store.""" - # Always return the analyzer's current session store - return self.analyzer.session_store - - def analyze( - self, text: str, context: Optional[PipelineContext] = None - ) -> Tuple[str, List[Dict[str, Any]]]: - # Call analyzer and get results - anonymized_text, found_pii, _ = self.analyzer.analyze(text, context=context) - - # Log found PII details (without modifying the found_pii list) - if found_pii: - for pii in found_pii: - logger.info( - "PII detected", - pii_type=pii["type"], - value="*" * len(pii["value"]), # Don't log actual value - score=f"{pii['score']:.2f}", - ) - - # Return the exact same objects we got from the analyzer - return anonymized_text, found_pii - - def restore_pii(self, anonymized_text: str) -> str: - """ - Restore PII in the given anonymized text using the current session. - """ - if self.session_store is None: - logger.warning("No active PII session found. 
Unable to restore PII.") - return anonymized_text - - # Use the analyzer's restore_pii method with the current session store - return self.analyzer.restore_pii(anonymized_text, self.session_store) diff --git a/src/codegate/pipeline/pii/pii.py b/src/codegate/pipeline/pii/pii.py index f0b9f271..fde89428 100644 --- a/src/codegate/pipeline/pii/pii.py +++ b/src/codegate/pipeline/pii/pii.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple +import uuid import regex as re import structlog @@ -6,13 +7,15 @@ from litellm.types.utils import Delta, StreamingChoices from codegate.config import Config +from codegate.db.models import AlertSeverity from codegate.pipeline.base import ( PipelineContext, PipelineResult, PipelineStep, ) from codegate.pipeline.output import OutputPipelineContext, OutputPipelineStep -from codegate.pipeline.pii.manager import PiiManager +from codegate.pipeline.pii.analyzer import PiiAnalyzer +from codegate.pipeline.sensitive_data.manager import SensitiveData, SensitiveDataManager from codegate.pipeline.systemmsg import add_or_update_system_message logger = structlog.get_logger("codegate") @@ -25,7 +28,7 @@ class CodegatePii(PipelineStep): Methods: __init__: - Initializes the CodegatePii pipeline step and sets up the PiiManager. + Initializes the CodegatePii pipeline step and sets up the SensitiveDataManager. name: Returns the name of the pipeline step. @@ -37,14 +40,15 @@ class CodegatePii(PipelineStep): Processes the chat completion request to detect and redact PII. Updates the request with anonymized text and stores PII details in the context metadata. - restore_pii(anonymized_text: str) -> str: - Restores the original PII from the anonymized text using the PiiManager. + restore_pii(session_id: str, anonymized_text: str) -> str: + Restores the original PII from the anonymized text using the SensitiveDataManager. 
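A sketch of the intended redact/restore round trip through `SensitiveDataManager`, inferred from the call sites in this diff; that `get_by_session_id` yields placeholder-to-original-string pairs is an assumption based on the restore loop below:

```python
import uuid

from codegate.pipeline.sensitive_data.manager import SensitiveData, SensitiveDataManager

manager = SensitiveDataManager()
session_id = str(uuid.uuid4())

# Redact: store() hands back the placeholder that replaces the raw value.
secret = SensitiveData(original="alice@example.com", service="pii", type="EMAIL_ADDRESS")
placeholder = manager.store(session_id, secret)
redacted = f"Contact {placeholder} for access"

# Restore: walk the session's placeholder -> original mapping, as restore_pii does.
restored = redacted
for marker, original in (manager.get_by_session_id(session_id) or {}).items():
    restored = restored.replace(marker, original)
```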
""" - def __init__(self): + def __init__(self, sensitive_data_manager: SensitiveDataManager): """Initialize the CodegatePii pipeline step.""" super().__init__() - self.pii_manager = PiiManager() + self.sensitive_data_manager = sensitive_data_manager + self.analyzer = PiiAnalyzer.get_instance() @property def name(self) -> str: @@ -65,6 +69,68 @@ def _get_redacted_snippet(self, message: str, pii_details: List[Dict[str, Any]]) return message[start:end] + def process_results( + self, session_id: str, text: str, results: List, context: PipelineContext + ) -> Tuple[List, str]: + # Track found PII + found_pii = [] + + # Log each found PII instance and anonymize + anonymized_text = text + for result in results: + pii_value = text[result.start : result.end] + + # add to session store + obj = SensitiveData(original=pii_value, service="pii", type=result.entity_type) + uuid_placeholder = self.sensitive_data_manager.store(session_id, obj) + anonymized_text = anonymized_text.replace(pii_value, uuid_placeholder) + + # Add to found PII list + pii_info = { + "type": result.entity_type, + "value": pii_value, + "score": result.score, + "start": result.start, + "end": result.end, + "uuid_placeholder": uuid_placeholder, + } + found_pii.append(pii_info) + + # Log each PII detection with its UUID mapping + logger.info( + "PII detected and mapped", + pii_type=result.entity_type, + score=f"{result.score:.2f}", + uuid=uuid_placeholder, + # Don't log the actual PII value for security + value_length=len(pii_value), + session_id=session_id, + ) + + # Log summary of all PII found in this analysis + if found_pii and context: + # Create notification string for alert + notify_string = ( + f"**PII Detected** 🔒\n" + f"- Total PII Found: {len(found_pii)}\n" + f"- Types Found: {', '.join(set(p['type'] for p in found_pii))}\n" + ) + context.add_alert( + self.name, + trigger_string=notify_string, + severity_category=AlertSeverity.CRITICAL, + ) + + logger.info( + "PII analysis complete", + total_pii_found=len(found_pii), + pii_types=[p["type"] for p in found_pii], + session_id=session_id, + ) + + # Return the anonymized text, PII details, and session store + return found_pii, anonymized_text + async def process( self, request: ChatCompletionRequest, context: PipelineContext ) -> PipelineResult: @@ -75,23 +141,28 @@ async def process( total_pii_found = 0 all_pii_details: List[Dict[str, Any]] = [] last_redacted_text = "" + session_id = context.sensitive.session_id for i, message in enumerate(new_request["messages"]): if "content" in message and message["content"]: # This is where analyze and anonymize the text original_text = str(message["content"]) - anonymized_text, pii_details = self.pii_manager.analyze(original_text, context) - - if pii_details: - total_pii_found += len(pii_details) - all_pii_details.extend(pii_details) - new_request["messages"][i]["content"] = anonymized_text - - # If this is a user message, grab the redacted snippet! - if message.get("role") == "user": - last_redacted_text = self._get_redacted_snippet( - anonymized_text, pii_details - ) + results = self.analyzer.analyze(original_text, context) + if results: + pii_details, anonymized_text = self.process_results( + session_id, original_text, results, context + ) + + if pii_details: + total_pii_found += len(pii_details) + all_pii_details.extend(pii_details) + new_request["messages"][i]["content"] = anonymized_text + + # If this is a user message, grab the redacted snippet! 
+ if message.get("role") == "user": + last_redacted_text = self._get_redacted_snippet( + anonymized_text, pii_details + ) logger.info(f"Total PII instances redacted: {total_pii_found}") @@ -99,9 +170,10 @@ async def process( context.metadata["redacted_pii_count"] = total_pii_found context.metadata["redacted_pii_details"] = all_pii_details context.metadata["redacted_text"] = last_redacted_text + context.metadata["session_id"] = session_id if total_pii_found > 0: - context.metadata["pii_manager"] = self.pii_manager + context.metadata["sensitive_data_manager"] = self.sensitive_data_manager system_message = ChatCompletionSystemMessage( content=Config.get_config().prompts.pii_redacted, @@ -113,8 +185,31 @@ async def process( return PipelineResult(request=new_request, context=context) - def restore_pii(self, anonymized_text: str) -> str: - return self.pii_manager.restore_pii(anonymized_text) + def restore_pii(self, session_id: str, anonymized_text: str) -> str: + """ + Restore the original PII (Personally Identifiable Information) in the given anonymized text. + + This method replaces placeholders in the anonymized text with their corresponding original + PII values using the mappings stored in the provided SessionStore. + + Args: + anonymized_text (str): The text containing placeholders for PII. + session_id (str): The session id containing mappings of placeholders + to original PII. + + Returns: + str: The text with the original PII restored. + """ + session_data = self.sensitive_data_manager.get_by_session_id(session_id) + if not session_data: + logger.warning( + "No active PII session found for given session ID. Unable to restore PII." + ) + return anonymized_text + + for uuid_placeholder, original_pii in session_data.items(): + anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii) + return anonymized_text class PiiUnRedactionStep(OutputPipelineStep): @@ -136,12 +231,12 @@ class PiiUnRedactionStep(OutputPipelineStep): """ def __init__(self): - self.redacted_pattern = re.compile(r"<([0-9a-f-]{0,36})>") + self.redacted_pattern = re.compile(r"#([0-9a-f-]{0,36})#") self.complete_uuid_pattern = re.compile( r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" ) # noqa: E501 - self.marker_start = "<" - self.marker_end = ">" + self.marker_start = "#" + self.marker_end = "#" @property def name(self) -> str: @@ -151,7 +246,7 @@ def _is_complete_uuid(self, uuid_str: str) -> bool: """Check if the string is a complete UUID""" return bool(self.complete_uuid_pattern.match(uuid_str)) - async def process_chunk( + async def process_chunk( # noqa: C901 self, chunk: ModelResponse, context: OutputPipelineContext, @@ -162,6 +257,10 @@ async def process_chunk( return [chunk] content = chunk.choices[0].delta.content + session_id = input_context.sensitive.session_id + if not session_id: + logger.error("Could not get any session id, cannot process pii") + return [chunk] # Add current chunk to buffer if context.prefix_buffer: @@ -172,13 +271,13 @@ async def process_chunk( current_pos = 0 result = [] while current_pos < len(content): - start_idx = content.find("<", current_pos) + start_idx = content.find(self.marker_start, current_pos) if start_idx == -1: # No more markers!, add remaining content result.append(content[current_pos:]) break - end_idx = content.find(">", start_idx) + end_idx = content.find(self.marker_end, start_idx + 1) if end_idx == -1: # Incomplete marker, buffer the rest context.prefix_buffer = content[current_pos:] @@ -190,16 +289,18 @@ async def process_chunk( # 
             uuid_marker = content[start_idx : end_idx + 1]
-            uuid_value = uuid_marker[1:-1]  # Remove < >
+            uuid_value = uuid_marker[1:-1]  # Remove the surrounding # markers

             if self._is_complete_uuid(uuid_value):
                 # Get the sensitive data manager from context metadata
                 logger.debug(f"Valid UUID found: {uuid_value}")
-                pii_manager = input_context.metadata.get("pii_manager") if input_context else None
-                if pii_manager and pii_manager.session_store:
+                sensitive_data_manager = (
+                    input_context.metadata.get("sensitive_data_manager") if input_context else None
+                )
+                if sensitive_data_manager and sensitive_data_manager.session_store:
                     # Restore the original value via the sensitive data manager
                     logger.debug("Attempting to restore PII from UUID marker")
-                    original = pii_manager.session_store.get_pii(uuid_marker)
+                    original = sensitive_data_manager.get_original_value(session_id, uuid_marker)
                     logger.debug(f"Restored PII: {original}")
                     result.append(original)
                 else:
diff --git a/src/codegate/pipeline/secrets/gatecrypto.py b/src/codegate/pipeline/secrets/gatecrypto.py
deleted file mode 100644
index 859b025d..00000000
--- a/src/codegate/pipeline/secrets/gatecrypto.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import os
-import time
-from base64 import b64decode, b64encode
-
-import structlog
-from cryptography.hazmat.primitives.ciphers.aead import AESGCM
-
-logger = structlog.get_logger("codegate")
-
-
-class CodeGateCrypto:
-    """
-    Manage session keys and provide encryption / decryption of tokens with replay protection.
-    Attributes:
-        session_keys (dict): A dictionary to store session keys with their associated timestamps.
-        SESSION_KEY_LIFETIME (int): The lifetime of a session key in seconds.
-        NONCE_SIZE (int): The size of the nonce used in AES GCM mode.
-    Methods:
-        generate_session_key(session_id):
-            Generates a session key with an associated timestamp.
-        get_session_key(session_id):
-            Retrieves a session key if it is still valid.
-        cleanup_expired_keys():
-            Removes expired session keys from memory.
-        encrypt_token(token, session_id):
-            Encrypts a token with a session key and adds a timestamp for replay protection.
-        decrypt_token(encrypted_token, session_id):
-            Decrypts a token and validates its timestamp to prevent replay attacks.
-        wipe_bytearray(data):
-            Securely wipes a bytearray in-place.
- """ - - def __init__(self): - self.session_keys = {} - self.SESSION_KEY_LIFETIME = 600 # 10 minutes - self.NONCE_SIZE = 12 # AES GCM recommended nonce size - - def generate_session_key(self, session_id): - """Generates a session key with an associated timestamp.""" - key = os.urandom(32) # Generate a 256-bit key - self.session_keys[session_id] = (key, time.time()) - return key - - def get_session_key(self, session_id): - """Retrieves a session key if it is still valid.""" - key_data = self.session_keys.get(session_id) - if key_data: - key, timestamp = key_data - if time.time() - timestamp < self.SESSION_KEY_LIFETIME: - return key - else: - # Key has expired - del self.session_keys[session_id] - return None - - def cleanup_expired_keys(self): - """Removes expired session keys from memory.""" - now = time.time() - expired_keys = [ - session_id - for session_id, (key, timestamp) in self.session_keys.items() - if now - timestamp >= self.SESSION_KEY_LIFETIME - ] - for session_id in expired_keys: - del self.session_keys[session_id] - - def encrypt_token(self, token, session_id): - """Encrypts a token with a session key and adds a timestamp for replay protection.""" - key = self.generate_session_key(session_id) - nonce = os.urandom(self.NONCE_SIZE) - timestamp = int(time.time()) - data = f"{token}:{timestamp}".encode() # Append timestamp to token - - aesgcm = AESGCM(key) - ciphertext = aesgcm.encrypt(nonce, data, None) # None for no associated data - - # Combine nonce and ciphertext (which includes the authentication tag) - encrypted_token = b64encode(nonce + ciphertext).decode() - return encrypted_token - - def decrypt_token(self, encrypted_token, session_id): - """Decrypts a token and validates its timestamp to prevent replay attacks.""" - key = self.get_session_key(session_id) - if not key: - raise ValueError("Session key expired or invalid.") - - encrypted_data = b64decode(encrypted_token) - nonce = encrypted_data[: self.NONCE_SIZE] - ciphertext = encrypted_data[self.NONCE_SIZE :] # Includes authentication tag - - aesgcm = AESGCM(key) - try: - decrypted_data = aesgcm.decrypt( - nonce, ciphertext, None - ).decode() # None for no associated data - except Exception as e: - raise ValueError("Decryption failed: Invalid token or tampering detected.") from e - - token, timestamp = decrypted_data.rsplit(":", 1) - if time.time() - int(timestamp) > self.SESSION_KEY_LIFETIME: - raise ValueError("Token has expired.") - - return token - - def wipe_bytearray(self, data): - """Securely wipes a bytearray in-place.""" - if not isinstance(data, bytearray): - raise ValueError("Only bytearray objects can be securely wiped.") - for i in range(len(data)): - data[i] = 0 # Overwrite each byte with 0 - logger.info("Sensitive data securely wiped from memory.") diff --git a/src/codegate/pipeline/secrets/manager.py b/src/codegate/pipeline/secrets/manager.py deleted file mode 100644 index bef07c75..00000000 --- a/src/codegate/pipeline/secrets/manager.py +++ /dev/null @@ -1,117 +0,0 @@ -from typing import NamedTuple, Optional - -import structlog - -from codegate.pipeline.secrets.gatecrypto import CodeGateCrypto - -logger = structlog.get_logger("codegate") - - -class SecretEntry(NamedTuple): - """Represents a stored secret""" - - original: str - encrypted: str - service: str - secret_type: str - - -class SecretsManager: - """Manages encryption, storage and retrieval of secrets""" - - def __init__(self): - self.crypto = CodeGateCrypto() - self._session_store: dict[str, dict[str, SecretEntry]] = {} - 
self._encrypted_to_session: dict[str, str] = {} # Reverse lookup index - - def store_secret(self, value: str, service: str, secret_type: str, session_id: str) -> str: - """ - Encrypts and stores a secret value. - Returns the encrypted value. - """ - if not value: - raise ValueError("Value must be provided") - if not service: - raise ValueError("Service must be provided") - if not secret_type: - raise ValueError("Secret type must be provided") - if not session_id: - raise ValueError("Session ID must be provided") - - encrypted_value = self.crypto.encrypt_token(value, session_id) - - # Store mappings - session_secrets = self._session_store.get(session_id, {}) - session_secrets[encrypted_value] = SecretEntry( - original=value, - encrypted=encrypted_value, - service=service, - secret_type=secret_type, - ) - self._session_store[session_id] = session_secrets - self._encrypted_to_session[encrypted_value] = session_id - - logger.debug("Stored secret", service=service, type=secret_type, encrypted=encrypted_value) - - return encrypted_value - - def get_original_value(self, encrypted_value: str, session_id: str) -> Optional[str]: - """Retrieve original value for an encrypted value""" - try: - stored_session_id = self._encrypted_to_session.get(encrypted_value) - if stored_session_id == session_id: - session_secrets = self._session_store[session_id].get(encrypted_value) - if session_secrets: - return session_secrets.original - except Exception as e: - logger.error("Error retrieving secret", error=str(e)) - return None - - def get_by_session_id(self, session_id: str) -> Optional[SecretEntry]: - """Get stored data by session ID""" - return self._session_store.get(session_id) - - def cleanup(self): - """Securely wipe sensitive data""" - try: - # Convert and wipe original values - for secrets in self._session_store.values(): - for entry in secrets.values(): - original_bytes = bytearray(entry.original.encode()) - self.crypto.wipe_bytearray(original_bytes) - - # Clear the dictionaries - self._session_store.clear() - self._encrypted_to_session.clear() - - logger.info("Secrets manager data securely wiped") - except Exception as e: - logger.error("Error during secure cleanup", error=str(e)) - - def cleanup_session(self, session_id: str): - """ - Remove a specific session's secrets and perform secure cleanup. 
-
-        Args:
-            session_id (str): The session identifier to remove
-        """
-        try:
-            # Get the secret entry for the session
-            secrets = self._session_store.get(session_id, {})
-            if secrets:
-                for entry in secrets.values():
-                    # Securely wipe the original value
-                    original_bytes = bytearray(entry.original.encode())
-                    self.crypto.wipe_bytearray(original_bytes)
-
-                    # Remove the encrypted value from the reverse lookup index
-                    self._encrypted_to_session.pop(entry.encrypted, None)
-
-                # Remove the session from the store
-                self._session_store.pop(session_id, None)
-
-                logger.debug("Session secrets securely removed", session_id=session_id)
-            else:
-                logger.debug("No secrets found for session", session_id=session_id)
-        except Exception as e:
-            logger.error("Error during session cleanup", session_id=session_id, error=str(e))
diff --git a/src/codegate/pipeline/secrets/secrets.py b/src/codegate/pipeline/secrets/secrets.py
index 184c3ba3..527c817f 100644
--- a/src/codegate/pipeline/secrets/secrets.py
+++ b/src/codegate/pipeline/secrets/secrets.py
@@ -16,8 +16,8 @@
     PipelineStep,
 )
 from codegate.pipeline.output import OutputPipelineContext, OutputPipelineStep
-from codegate.pipeline.secrets.manager import SecretsManager
 from codegate.pipeline.secrets.signatures import CodegateSignatures, Match
+from codegate.pipeline.sensitive_data.manager import SensitiveData, SensitiveDataManager
 from codegate.pipeline.systemmsg import add_or_update_system_message

 logger = structlog.get_logger("codegate")

@@ -171,25 +171,35 @@ class SecretsEncryptor(SecretsModifier):
     def __init__(
         self,
-        secrets_manager: SecretsManager,
+        sensitive_data_manager: SensitiveDataManager,
         context: PipelineContext,
         session_id: str,
     ):
-        self._secrets_manager = secrets_manager
+        self._sensitive_data_manager = sensitive_data_manager
         self._session_id = session_id
         self._context = context
         self._name = "codegate-secrets"
+        super().__init__()

     def _hide_secret(self, match: Match) -> str:
         # Validate the match and store it, swapping the value for a placeholder
-        encrypted_value = self._secrets_manager.store_secret(
-            match.value,
-            match.service,
-            match.type,
-            self._session_id,
+        if not self._session_id:
+            raise ValueError("Session id must be provided")
+
+        if not match.value:
+            raise ValueError("Value must be provided")
+        if not match.service:
+            raise ValueError("Service must be provided")
+        if not match.type:
+            raise ValueError("Secret type must be provided")
+
+        obj = SensitiveData(original=match.value, service=match.service, type=match.type)
+        uuid_placeholder = self._sensitive_data_manager.store(self._session_id, obj)
+        logger.debug(
+            "Stored secret", service=match.service, type=match.type, placeholder=uuid_placeholder
         )
-        return f"REDACTED<${encrypted_value}>"
+        return f"REDACTED<{uuid_placeholder}>"

     def _notify_secret(
         self, match: Match, code_snippet: Optional[CodeSnippet], protected_text: List[str]
@@ -251,7 +261,7 @@ def _redact_text(
         self,
         text: str,
         snippet: Optional[CodeSnippet],
-        secrets_manager: SecretsManager,
+        sensitive_data_manager: SensitiveDataManager,
         session_id: str,
         context: PipelineContext,
     ) -> tuple[str, List[Match]]:
@@ -260,14 +270,14 @@ def _redact_text(

         Args:
             text: The text to protect
-            secrets_manager: ..
+            sensitive_data_manager: ..
             session_id: ..
context: The pipeline context to be able to log alerts Returns: Tuple containing protected text with encrypted values and the count of redacted secrets """ # Find secrets in the text - text_encryptor = SecretsEncryptor(secrets_manager, context, session_id) + text_encryptor = SecretsEncryptor(sensitive_data_manager, context, session_id) return text_encryptor.obfuscate(text, snippet) async def process( @@ -287,8 +297,10 @@ async def process( if "messages" not in request: return PipelineResult(request=request, context=context) - secrets_manager = context.sensitive.manager - if not secrets_manager or not isinstance(secrets_manager, SecretsManager): + sensitive_data_manager = context.sensitive.manager + if not sensitive_data_manager or not isinstance( + sensitive_data_manager, SensitiveDataManager + ): raise ValueError("Secrets manager not found in context") session_id = context.sensitive.session_id if not session_id: @@ -305,7 +317,7 @@ async def process( for i, message in enumerate(new_request["messages"]): if "content" in message and message["content"]: redacted_content, secrets_matched = self._redact_message_content( - message["content"], secrets_manager, session_id, context + message["content"], sensitive_data_manager, session_id, context ) new_request["messages"][i]["content"] = redacted_content if i > last_assistant_idx: @@ -313,7 +325,7 @@ async def process( new_request = self._finalize_redaction(context, total_matches, new_request) return PipelineResult(request=new_request, context=context) - def _redact_message_content(self, message_content, secrets_manager, session_id, context): + def _redact_message_content(self, message_content, sensitive_data_manager, session_id, context): # Extract any code snippets extractor = MessageCodeExtractorFactory.create_snippet_extractor(context.client) snippets = extractor.extract_snippets(message_content) @@ -322,7 +334,7 @@ def _redact_message_content(self, message_content, secrets_manager, session_id, for snippet in snippets: redacted_snippet, secrets_matched = self._redact_text( - snippet, snippet, secrets_manager, session_id, context + snippet, snippet, sensitive_data_manager, session_id, context ) redacted_snippets[snippet.code] = redacted_snippet total_matches.extend(secrets_matched) @@ -336,7 +348,7 @@ def _redact_message_content(self, message_content, secrets_manager, session_id, if start_index > last_end: non_snippet_part = message_content[last_end:start_index] redacted_part, secrets_matched = self._redact_text( - non_snippet_part, "", secrets_manager, session_id, context + non_snippet_part, "", sensitive_data_manager, session_id, context ) non_snippet_parts.append(redacted_part) total_matches.extend(secrets_matched) @@ -347,7 +359,7 @@ def _redact_message_content(self, message_content, secrets_manager, session_id, if last_end < len(message_content): remaining_text = message_content[last_end:] redacted_remaining, secrets_matched = self._redact_text( - remaining_text, "", secrets_manager, session_id, context + remaining_text, "", sensitive_data_manager, session_id, context ) non_snippet_parts.append(redacted_remaining) total_matches.extend(secrets_matched) @@ -428,9 +440,14 @@ async def process_chunk( encrypted_value = match.group(1) if encrypted_value.startswith("$"): encrypted_value = encrypted_value[1:] + + session_id = input_context.sensitive.session_id + if not session_id: + raise ValueError("Session ID not found in context") + original_value = input_context.sensitive.manager.get_original_value( + session_id, encrypted_value, - 
input_context.sensitive.session_id,
             )

             if original_value is None:
diff --git a/src/codegate/pipeline/sensitive_data/manager.py b/src/codegate/pipeline/sensitive_data/manager.py
new file mode 100644
index 00000000..89506d15
--- /dev/null
+++ b/src/codegate/pipeline/sensitive_data/manager.py
@@ -0,0 +1,50 @@
+from typing import Dict, Optional
+
+import pydantic
+import structlog
+
+from codegate.pipeline.sensitive_data.session_store import SessionStore
+
+logger = structlog.get_logger("codegate")
+
+
+class SensitiveData(pydantic.BaseModel):
+    """Represents sensitive data with additional metadata."""
+
+    original: str
+    service: Optional[str] = None
+    type: Optional[str] = None
+
+
+class SensitiveDataManager:
+    """Manages the storage and retrieval of sensitive data such as PII and secrets."""
+
+    def __init__(self):
+        self.session_store = SessionStore()
+
+    def store(self, session_id: str, value: SensitiveData) -> Optional[str]:
+        if not session_id or not value.original:
+            return None
+        return self.session_store.add_mapping(session_id, value.model_dump_json())
+
+    def get_by_session_id(self, session_id: str) -> Optional[Dict[str, str]]:
+        if not session_id:
+            return None
+        data = self.session_store.get_by_session_id(session_id)
+        if not data:
+            return None
+        # The store maps placeholders to serialized SensitiveData entries; decode
+        # each one so callers receive placeholder -> original value mappings.
+        return {
+            placeholder: SensitiveData.model_validate_json(entry).original
+            for placeholder, entry in data.items()
+        }
+
+    def get_original_value(self, session_id: str, uuid_placeholder: str) -> Optional[str]:
+        if not session_id:
+            return None
+        secret_entry_json = self.session_store.get_mapping(session_id, uuid_placeholder)
+        return (
+            SensitiveData.model_validate_json(secret_entry_json).original
+            if secret_entry_json
+            else None
+        )
+
+    def cleanup_session(self, session_id: str):
+        if session_id:
+            self.session_store.cleanup_session(session_id)
+
+    def cleanup(self):
+        self.session_store.cleanup()
diff --git a/src/codegate/pipeline/sensitive_data/session_store.py b/src/codegate/pipeline/sensitive_data/session_store.py
new file mode 100644
index 00000000..5e508847
--- /dev/null
+++ b/src/codegate/pipeline/sensitive_data/session_store.py
@@ -0,0 +1,33 @@
+from typing import Dict, Optional
+import uuid
+
+
+class SessionStore:
+    """
+    A generic session store for managing data protection.
+ """ + + def __init__(self): + self.sessions: Dict[str, Dict[str, str]] = {} + + def add_mapping(self, session_id: str, data: str) -> str: + uuid_placeholder = f"#{str(uuid.uuid4())}#" + if session_id not in self.sessions: + self.sessions[session_id] = {} + self.sessions[session_id][uuid_placeholder] = data + return uuid_placeholder + + def get_by_session_id(self, session_id: str) -> Optional[Dict]: + return self.sessions.get(session_id, None) + + def get_mapping(self, session_id: str, uuid_placeholder: str) -> Optional[str]: + return self.sessions.get(session_id, {}).get(uuid_placeholder) + + def cleanup_session(self, session_id: str): + """Clears all stored mappings for a specific session.""" + if session_id in self.sessions: + del self.sessions[session_id] + + def cleanup(self): + """Clears all stored mappings for all sessions.""" + self.sessions.clear() diff --git a/src/codegate/providers/copilot/provider.py b/src/codegate/providers/copilot/provider.py index b17e98a8..20ac43f9 100644 --- a/src/codegate/providers/copilot/provider.py +++ b/src/codegate/providers/copilot/provider.py @@ -17,7 +17,7 @@ from codegate.pipeline.base import PipelineContext from codegate.pipeline.factory import PipelineFactory from codegate.pipeline.output import OutputPipelineInstance -from codegate.pipeline.secrets.manager import SecretsManager +from codegate.pipeline.sensitive_data.manager import SensitiveDataManager from codegate.providers.copilot.mapping import PIPELINE_ROUTES, VALIDATED_ROUTES, PipelineType from codegate.providers.copilot.pipeline import ( CopilotChatPipeline, @@ -39,7 +39,7 @@ TEMPDIR = tempfile.TemporaryDirectory(prefix="codegate-", dir=basedir, delete=False) -def _dump_data(suffix, func): +def _dump_data(suffix, func, trigger: bytes | None = None): if os.getenv("CODEGATE_DUMP_DIR"): buf = bytearray(b"") @@ -48,7 +48,7 @@ def inner(self, data: bytes): func(self, data) buf.extend(data) - if data == b"0\r\n\r\n": + if not trigger or data == trigger: ts = datetime.datetime.now() fname = os.path.join(TEMPDIR.name, ts.strftime(f"{suffix}-%Y%m%dT%H%M%S%f.txt")) with open(fname, mode="wb") as fd: @@ -64,7 +64,7 @@ def _dump_request(func): def _dump_response(func): - return _dump_data("response", func) + return _dump_data("response", func, b"0\r\n\r\n") # Constants @@ -200,7 +200,7 @@ def __init__(self, loop: asyncio.AbstractEventLoop): self.ca = CertificateAuthority.get_instance() self.cert_manager = TLSCertDomainManager(self.ca) self._closing = False - self.pipeline_factory = PipelineFactory(SecretsManager()) + self.pipeline_factory = PipelineFactory(SensitiveDataManager()) self.input_pipeline: Optional[CopilotPipeline] = None self.fim_pipeline: Optional[CopilotPipeline] = None # the context as provided by the pipeline @@ -336,7 +336,12 @@ def _check_buffer_size(self, new_data: bytes) -> bool: """Check if adding new data would exceed buffer size limit""" return len(self.buffer) + len(new_data) <= MAX_BUFFER_SIZE + @_dump_request + def _dump_create_http_request(self, data: bytes) -> bytes: + return data + async def _forward_data_through_pipeline(self, data: bytes) -> Union[HttpRequest, HttpResponse]: + self._dump_create_http_request(data) http_request = http_request_from_bytes(data) if not http_request: # we couldn't parse this into an HTTP request, so we just pass through diff --git a/src/codegate/providers/crud/crud.py b/src/codegate/providers/crud/crud.py index 0bffe1a8..8bba52b8 100644 --- a/src/codegate/providers/crud/crud.py +++ b/src/codegate/providers/crud/crud.py @@ -401,7 +401,7 @@ 
async def try_update_to_provider( dbprovend.endpoint, authm.auth_type, authm.auth_blob, prov ) except Exception as err: - logger.error( + logger.info( "Unable to get models from provider. Skipping", provider=dbprovend.name, err=str(err), diff --git a/tests/integration/integration_tests.py b/tests/integration/integration_tests.py index d1ffd794..75bda907 100644 --- a/tests/integration/integration_tests.py +++ b/tests/integration/integration_tests.py @@ -231,7 +231,7 @@ async def _setup_muxing( provider_endpoint = muxing_config.get("provider_endpoint") try: data_with_api_keys = self.replace_env_variables(provider_endpoint["data"], os.environ) - response_create_provider = self.call_codegate( + response_create_provider = self.call_provider( provider=provider, url=provider_endpoint["url"], headers=provider_endpoint["headers"], @@ -250,7 +250,7 @@ async def _setup_muxing( mux["provider_id"] = created_provider_endpoint["id"] # The endpoint actually takes a list - self.call_codegate( + self.call_provider( provider=provider, url=muxes_rules["url"], headers=muxes_rules["headers"], diff --git a/tests/muxing/test_rulematcher.py b/tests/muxing/test_rulematcher.py index 7340d983..7e551525 100644 --- a/tests/muxing/test_rulematcher.py +++ b/tests/muxing/test_rulematcher.py @@ -51,13 +51,13 @@ def test_catch_all(matcher_blob, thing_to_match): [ (None, [], True), # Empty filenames and no blob (None, ["main.py"], True), # Empty blob should match - (".py", ["main.py"], True), # Extension match + ("*.py", ["main.py"], True), # Extension match ("main.py", ["main.py"], True), # Full name match - (".py", ["main.py", "test.py"], True), # Extension match + ("*.py", ["main.py", "test.py"], True), # Extension match ("main.py", ["main.py", "test.py"], True), # Full name match ("main.py", ["test.py"], False), # Full name no match - (".js", ["main.py", "test.py"], False), # Extension no match - (".ts", ["main.tsx", "test.tsx"], False), # Extension no match + ("*.js", ["main.py", "test.py"], False), # Extension no match + ("*.ts", ["main.tsx", "test.tsx"], False), # Extension no match ], ) def test_file_matcher( @@ -89,13 +89,13 @@ def test_file_matcher( [ (None, [], True), # Empty filenames and no blob (None, ["main.py"], True), # Empty blob should match - (".py", ["main.py"], True), # Extension match + ("*.py", ["main.py"], True), # Extension match ("main.py", ["main.py"], True), # Full name match - (".py", ["main.py", "test.py"], True), # Extension match + ("*.py", ["main.py", "test.py"], True), # Extension match ("main.py", ["main.py", "test.py"], True), # Full name match ("main.py", ["test.py"], False), # Full name no match - (".js", ["main.py", "test.py"], False), # Extension no match - (".ts", ["main.tsx", "test.tsx"], False), # Extension no match + ("*.js", ["main.py", "test.py"], False), # Extension no match + ("*.ts", ["main.tsx", "test.tsx"], False), # Extension no match ], ) @pytest.mark.parametrize( diff --git a/tests/muxing/test_semantic_router.py b/tests/muxing/test_semantic_router.py new file mode 100644 index 00000000..c8c7edc6 --- /dev/null +++ b/tests/muxing/test_semantic_router.py @@ -0,0 +1,590 @@ +import uuid +from pathlib import Path +from typing import List + +import pytest +from pydantic import BaseModel + +from codegate.db import connection +from codegate.muxing.semantic_router import PersonaDoesNotExistError, SemanticRouter + + +@pytest.fixture +def db_path(): + """Creates a temporary database file path.""" + current_test_dir = Path(__file__).parent + db_filepath = current_test_dir / 
f"codegate_test_{uuid.uuid4()}.db" + db_fullpath = db_filepath.absolute() + connection.init_db_sync(str(db_fullpath)) + yield db_fullpath + if db_fullpath.is_file(): + db_fullpath.unlink() + + +@pytest.fixture() +def db_recorder(db_path) -> connection.DbRecorder: + """Creates a DbRecorder instance with test database.""" + return connection.DbRecorder(sqlite_path=db_path, _no_singleton=True) + + +@pytest.fixture() +def db_reader(db_path) -> connection.DbReader: + """Creates a DbReader instance with test database.""" + return connection.DbReader(sqlite_path=db_path, _no_singleton=True) + + +@pytest.fixture() +def semantic_router_mocked_db( + db_recorder: connection.DbRecorder, db_reader: connection.DbReader +) -> SemanticRouter: + """Creates a SemanticRouter instance with mocked database.""" + semantic_router = SemanticRouter() + semantic_router._db_reader = db_reader + semantic_router._db_recorder = db_recorder + return semantic_router + + +@pytest.mark.asyncio +async def test_add_persona(semantic_router_mocked_db: SemanticRouter): + """Test adding a persona to the database.""" + persona_name = "test_persona" + persona_desc = "test_persona_desc" + await semantic_router_mocked_db.add_persona(persona_name, persona_desc) + retrieved_persona = await semantic_router_mocked_db._db_reader.get_persona_by_name(persona_name) + assert retrieved_persona.name == persona_name + assert retrieved_persona.description == persona_desc + + +@pytest.mark.asyncio +async def test_persona_not_exist_match(semantic_router_mocked_db: SemanticRouter): + """Test checking persona match when persona does not exist""" + persona_name = "test_persona" + query = "test_query" + with pytest.raises(PersonaDoesNotExistError): + await semantic_router_mocked_db.check_persona_match(persona_name, query) + + +class PersonaMatchTest(BaseModel): + persona_name: str + persona_desc: str + pass_queries: List[str] + fail_queries: List[str] + + +simple_persona = PersonaMatchTest( + persona_name="test_persona", + persona_desc="test_desc", + pass_queries=["test_desc", "test_desc2"], + fail_queries=["foo"], +) + +software_architect = PersonaMatchTest( + persona_name="software architect", + persona_desc=""" + Expert in designing large-scale software systems and technical infrastructure. + Specializes in distributed systems, microservices architecture, + and cloud-native applications. + Deep knowledge of architectural patterns like CQRS, event sourcing, hexagonal architecture, + and domain-driven design. + Experienced in designing scalable, resilient, and maintainable software solutions. + Proficient in evaluating technology stacks and making strategic technical decisions. + Skilled at creating architecture diagrams, technical specifications, + and system documentation. + Focuses on non-functional requirements like performance, security, and reliability. + Guides development teams on best practices for implementing complex systems. + """, + pass_queries=[ + """ + How should I design a microservices architecture that can handle high traffic loads? + """, + """ + What's the best approach for implementing event sourcing in a distributed system? + """, + """ + I need to design a system that can scale to millions of users. What architecture would you + recommend? + """, + """ + Can you explain the trade-offs between monolithic and microservices architectures for our + new project? + """, + ], + fail_queries=[ + """ + How do I create a simple landing page with HTML and CSS? + """, + """ + What's the best way to optimize my SQL query performance? 
+ """, + """ + Can you help me debug this JavaScript function that's throwing an error? + """, + """ + How do I implement user authentication in my React application? + """, + ], +) + +# Data Scientist Persona +data_scientist = PersonaMatchTest( + persona_name="data scientist", + persona_desc=""" + Expert in analyzing and interpreting complex data to solve business problems. + Specializes in statistical analysis, machine learning algorithms, and predictive modeling. + Builds and deploys models for classification, regression, clustering, and anomaly detection. + Proficient in data preprocessing, feature engineering, and model evaluation techniques. + Uses Python with libraries like NumPy, Pandas, scikit-learn, TensorFlow, and PyTorch. + Experienced with data visualization using Matplotlib, Seaborn, and interactive dashboards. + Applies experimental design principles and A/B testing methodologies. + Works with structured and unstructured data, including time series and text. + Implements data pipelines for model training, validation, and deployment. + Communicates insights and recommendations based on data analysis to stakeholders. + + Handles class imbalance problems in classification tasks using techniques like SMOTE, + undersampling, oversampling, and class weighting. Addresses customer churn prediction + challenges by identifying key features that indicate potential churners. + + Applies feature selection methods for high-dimensional datasets, including filter methods + (correlation, chi-square), wrapper methods (recursive feature elimination), and embedded + methods (LASSO regularization). + + Prevents overfitting and high variance in tree-based models like random forests through + techniques such as pruning, setting maximum depth, adjusting minimum samples per leaf, + and cross-validation. + + Specializes in time series forecasting for sales and demand prediction, using methods like + ARIMA, SARIMA, Prophet, and exponential smoothing to handle seasonal patterns and trends. + Implements forecasting models that account for quarterly business cycles and seasonal + variations in customer behavior. + + Evaluates model performance using appropriate metrics: accuracy, precision, recall, + F1-score + for classification; RMSE, MAE, R-squared for regression; and specialized metrics for + time series forecasting like MAPE and SMAPE. + + Experienced in developing customer segmentation models, recommendation systems, + anomaly detection algorithms, and predictive maintenance solutions. + """, + pass_queries=[ + """ + How should I handle class imbalance in my customer churn prediction model? + """, + """ + What feature selection techniques would work best for my high-dimensional dataset? + """, + """ + I'm getting high variance in my random forest model. How can I prevent overfitting? + """, + """ + What's the best approach for forecasting seasonal time series data for our sales + predictions? + """, + ], + fail_queries=[ + """ + How do I structure my React components for a single-page application? + """, + """ + What's the best way to implement a CI/CD pipeline for my microservices? + """, + """ + Can you help me design a responsive layout for mobile and desktop browsers? + """, + """ + How should I configure my Kubernetes cluster for high availability? + """, + ], +) + +# UX Designer Persona +ux_designer = PersonaMatchTest( + persona_name="ux designer", + persona_desc=""" + Expert in creating intuitive, user-centered digital experiences and interfaces. 
+ Specializes in user research, usability testing, and interaction design. + Creates wireframes, prototypes, and user flows to visualize design solutions. + Conducts user interviews, usability studies, and analyzes user feedback. + Develops user personas and journey maps to understand user needs and pain points. + Designs information architecture and navigation systems for complex applications. + Applies design thinking methodology to solve user experience problems. + Knowledgeable about accessibility standards and inclusive design principles. + Collaborates with product managers and developers to implement user-friendly features. + Uses tools like Figma, Sketch, and Adobe XD to create high-fidelity mockups. + """, + pass_queries=[ + """ + How can I improve the user onboarding experience for my mobile application? + """, + """ + What usability testing methods would you recommend for evaluating our new interface design? + """, + """ + I'm designing a complex dashboard. What information architecture would make it most + intuitive for users? + """, + """ + How should I structure user research to identify pain points in our current + checkout process? + """, + ], + fail_queries=[ + """ + How do I configure a load balancer for my web servers? + """, + """ + What's the best way to implement a caching layer in my application? + """, + """ + Can you explain how to set up a CI/CD pipeline with GitHub Actions? + """, + """ + How do I optimize my database queries for better performance? + """, + ], +) + +# DevOps Engineer Persona +devops_engineer = PersonaMatchTest( + persona_name="devops engineer", + persona_desc=""" + Expertise: Infrastructure automation, CI/CD pipelines, cloud services, containerization, + and monitoring. + Proficient with tools like Docker, Kubernetes, Terraform, Ansible, and Jenkins. + Experienced with cloud platforms including AWS, Azure, and Google Cloud. + Strong knowledge of Linux/Unix systems administration and shell scripting. + Skilled in implementing microservices architectures and service mesh technologies. + Focus on reliability, scalability, security, and operational efficiency. + Practices infrastructure as code, GitOps, and site reliability engineering principles. + Experienced with monitoring tools like Prometheus, Grafana, and ELK stack. + """, + pass_queries=[ + """ + What's the best way to set up auto-scaling for my Kubernetes cluster on AWS? + """, + """ + I need to implement a zero-downtime deployment strategy for my microservices. + What approaches would you recommend? + """, + """ + How can I improve the security of my CI/CD pipeline and prevent supply chain attacks? + """, + """ + What monitoring metrics should I track to ensure the reliability of my distributed system? + """, + ], + fail_queries=[ + """ + How do I design an effective user onboarding flow for my mobile app? + """, + """ + What's the best algorithm for sentiment analysis on customer reviews? + """, + """ + Can you help me with color theory for my website redesign? + """, + """ + I need advice on optimizing my SQL queries for a reporting dashboard. + """, + ], +) + +# Security Specialist Persona +security_specialist = PersonaMatchTest( + persona_name="security specialist", + persona_desc=""" + Expert in cybersecurity, application security, and secure system design. + Specializes in identifying and mitigating security vulnerabilities and threats. + Performs security assessments, penetration testing, and code security reviews. 
+ Implements security controls like authentication, authorization, and encryption. + Knowledgeable about common attack vectors such as injection attacks, XSS, CSRF, and SSRF. + Experienced with security frameworks and standards like OWASP Top 10, NIST, and ISO 27001. + Designs secure architectures and implements defense-in-depth strategies. + Conducts security incident response and forensic analysis. + Implements security monitoring, logging, and alerting systems. + Stays current with emerging security threats and mitigation techniques. + """, + pass_queries=[ + """ + How can I protect my web application from SQL injection attacks? + """, + """ + What security controls should I implement for storing sensitive user data? + """, + """ + How do I conduct a thorough security assessment of our cloud infrastructure? + """, + """ + What's the best approach for implementing secure authentication in my API? + """, + ], + fail_queries=[ + """ + How do I optimize the loading speed of my website? + """, + """ + What's the best way to implement responsive design for mobile devices? + """, + """ + Can you help me design a database schema for my e-commerce application? + """, + """ + How should I structure my React components for better code organization? + """, + ], +) + +# Mobile Developer Persona +mobile_developer = PersonaMatchTest( + persona_name="mobile developer", + persona_desc=""" + Expert in building native and cross-platform mobile applications for iOS and Android. + Specializes in mobile UI development, responsive layouts, and platform-specific + design patterns. + Proficient in Swift and SwiftUI for iOS, Kotlin for Android, and React Native or + Flutter for cross-platform. + Implements mobile-specific features like push notifications, offline storage, and + location services. + Optimizes mobile applications for performance, battery efficiency, and limited + network connectivity. + Experienced with mobile app architecture patterns like MVVM, MVC, and Redux. + Integrates with device hardware features including camera, biometrics, sensors, + and Bluetooth. + Familiar with app store submission processes, app signing, and distribution workflows. + Implements secure data storage, authentication, and API communication on mobile devices. + Designs and develops responsive interfaces that work across different screen sizes + and orientations. + + Implements sophisticated offline-first data synchronization strategies + for mobile applications, + handling conflict resolution, data merging, and background syncing when connectivity + is restored. + Uses technologies like Realm, SQLite, Core Data, and Room Database to enable seamless + offline + experiences in React Native and native apps. + + Structures Swift code following the MVVM (Model-View-ViewModel) architectural pattern + to create + maintainable, testable iOS applications. Implements proper separation of concerns + with bindings + between views and view models using Combine, RxSwift, or SwiftUI's native state management. + + Specializes in deep linking implementation for both Android and iOS, enabling app-to-app + communication, marketing campaign tracking, and seamless user experiences when navigating + between web and mobile contexts. Configures Universal Links, App Links, and custom URL + schemes. + + Optimizes battery usage for location-based features by implementing intelligent location + tracking + strategies, including geofencing, significant location changes, deferred location updates, + and + region monitoring. 
Balances accuracy requirements with power consumption constraints. + + Develops efficient state management solutions for complex mobile applications using Redux, + MobX, Provider, or Riverpod for React Native apps, and native state management approaches + for iOS and Android. + + Creates responsive mobile interfaces that adapt to different device orientations, + screen sizes, + and pixel densities using constraint layouts, auto layout, size classes, and flexible + grid systems. + """, + pass_queries=[ + """ + What's the best approach for implementing offline-first data synchronization in my mobile + app? + """, + """ + How should I structure my Swift code to implement the MVVM pattern effectively? + """, + """ + What's the most efficient way to handle deep linking and app-to-app communication on + Android? + """, + """ + How can I optimize battery usage when implementing background location tracking? + """, + ], + fail_queries=[ + """ + How do I design a database schema with proper normalization for my web application? + """, + """ + What's the best approach for implementing a distributed caching layer in my microservices? + """, + """ + Can you help me set up a data pipeline for processing large datasets with Apache Spark? + """, + """ + How should I configure my load balancer to distribute traffic across my web servers? + """, + ], +) + +# Database Administrator Persona +database_administrator = PersonaMatchTest( + persona_name="database administrator", + persona_desc=""" + Expert in designing, implementing, and managing database systems for optimal performance and + reliability. + Specializes in database architecture, schema design, and query optimization techniques. + Proficient with relational databases like PostgreSQL, MySQL, Oracle, and SQL Server. + Implements and manages database security, access controls, and data protection measures. + Designs high-availability solutions using replication, clustering, and failover mechanisms. + Develops and executes backup strategies, disaster recovery plans, and data retention + policies. + Monitors database performance, identifies bottlenecks, and implements optimization + solutions. + Creates and maintains indexes, partitioning schemes, and other performance-enhancing + structures. + Experienced with database migration, version control, and change management processes. + Implements data integrity constraints, stored procedures, triggers, and database automation. + + Optimizes complex JOIN query performance in PostgreSQL through advanced techniques including + query rewriting, proper indexing strategies, materialized views, and query plan analysis. + Uses EXPLAIN ANALYZE to identify bottlenecks in query execution plans and implements + appropriate optimizations for specific query patterns. + + Designs and implements high-availability MySQL configurations with automatic failover using + technologies like MySQL Group Replication, Galera Cluster, Percona XtraDB Cluster, or MySQL + InnoDB Cluster with MySQL Router. Configures synchronous and asynchronous replication + strategies + to balance consistency and performance requirements. + + Develops sophisticated indexing strategies for tables with frequent write operations and + complex + read queries, balancing write performance with read optimization. Implements partial + indexes, + covering indexes, and composite indexes based on query patterns and cardinality analysis. 
+ + Specializes in large-scale database migrations between different database engines, + particularly + Oracle to PostgreSQL transitions. Uses tools like ora2pg, AWS DMS, and custom ETL processes + to + ensure data integrity, schema compatibility, and minimal downtime during migration. + + Implements table partitioning schemes based on data access patterns, including range + partitioning + for time-series data, list partitioning for categorical data, and hash partitioning for + evenly + distributed workloads. + + Configures and manages database connection pooling, query caching, and buffer management to + optimize resource utilization and throughput under varying workloads. + + Designs and implements database sharding strategies for horizontal scaling, including + consistent hashing algorithms, shard key selection, and cross-shard query optimization. + """, + pass_queries=[ + """ + How can I optimize the performance of complex JOIN queries in my PostgreSQL database? + """, + """ + What's the best approach for implementing a high-availability MySQL setup with automatic + failover? + """, + """ + How should I design my indexing strategy for a table with frequent writes and complex read + queries? + """, + """ + What's the most efficient way to migrate a large Oracle database to PostgreSQL with minimal + downtime? + """, + ], + fail_queries=[ + """ + How do I structure my React components to implement the Redux state management pattern? + """, + """ + What's the best approach for implementing responsive design with CSS Grid and Flexbox? + """, + """ + Can you help me set up a CI/CD pipeline for my containerized microservices? + """, + ], +) + +# Natural Language Processing Specialist Persona +nlp_specialist = PersonaMatchTest( + persona_name="nlp specialist", + persona_desc=""" + Expertise: Natural language processing, computational linguistics, and text analytics. + Proficient with NLP libraries and frameworks like NLTK, spaCy, Hugging Face Transformers, + and Gensim. + Experience with language models such as BERT, GPT, T5, and their applications. + Skilled in text preprocessing, tokenization, lemmatization, and feature extraction + techniques. + Knowledge of sentiment analysis, named entity recognition, topic modeling, and text + classification. + Familiar with word embeddings, contextual embeddings, and language representation methods. + Understanding of machine translation, question answering, and text summarization systems. + Background in information retrieval, semantic search, and conversational AI development. + """, + pass_queries=[ + """ + What approach should I take to fine-tune BERT for my custom text classification task? + """, + """ + How can I improve the accuracy of my named entity recognition system for medical texts? + """, + """ + What's the best way to implement semantic search using embeddings from language models? + """, + """ + I need to build a sentiment analysis system that can handle sarcasm and idioms. + Any suggestions? + """, + ], + fail_queries=[ + """ + How do I optimize my React components to reduce rendering time? + """, + """ + What's the best approach for implementing a CI/CD pipeline with Jenkins? + """, + """ + Can you help me design a responsive UI for my web application? + """, + """ + How should I structure my microservices architecture for scalability? 
+ """, + ], +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "persona_match_test", + [ + simple_persona, + software_architect, + data_scientist, + ux_designer, + devops_engineer, + security_specialist, + mobile_developer, + database_administrator, + nlp_specialist, + ], +) +async def test_check_persona_match( + semantic_router_mocked_db: SemanticRouter, persona_match_test: PersonaMatchTest +): + """Test checking persona match.""" + await semantic_router_mocked_db.add_persona( + persona_match_test.persona_name, persona_match_test.persona_desc + ) + + # Check for the queries that should pass + for query in persona_match_test.pass_queries: + match = await semantic_router_mocked_db.check_persona_match( + persona_match_test.persona_name, query + ) + assert match is True + + # Check for the queries that should fail + for query in persona_match_test.fail_queries: + match = await semantic_router_mocked_db.check_persona_match( + persona_match_test.persona_name, query + ) + assert match is False diff --git a/tests/pipeline/pii/test_analyzer.py b/tests/pipeline/pii/test_analyzer.py index 8d5a7c6e..d626b8cf 100644 --- a/tests/pipeline/pii/test_analyzer.py +++ b/tests/pipeline/pii/test_analyzer.py @@ -3,44 +3,7 @@ import pytest from presidio_analyzer import RecognizerResult -from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore - - -class TestPiiSessionStore: - def test_init_with_session_id(self): - session_id = "test-session" - store = PiiSessionStore(session_id) - assert store.session_id == session_id - assert store.mappings == {} - - def test_init_without_session_id(self): - store = PiiSessionStore() - assert isinstance(store.session_id, str) - assert len(store.session_id) > 0 - assert store.mappings == {} - - def test_add_mapping(self): - store = PiiSessionStore() - pii = "test@example.com" - placeholder = store.add_mapping(pii) - - assert placeholder.startswith("<") - assert placeholder.endswith(">") - assert store.mappings[placeholder] == pii - - def test_get_pii_existing(self): - store = PiiSessionStore() - pii = "test@example.com" - placeholder = store.add_mapping(pii) - - result = store.get_pii(placeholder) - assert result == pii - - def test_get_pii_nonexistent(self): - store = PiiSessionStore() - placeholder = "" - result = store.get_pii(placeholder) - assert result == placeholder +from codegate.pipeline.pii.analyzer import PiiAnalyzer class TestPiiAnalyzer: @@ -104,68 +67,31 @@ def test_singleton_pattern(self): with pytest.raises(RuntimeError, match="Use PiiAnalyzer.get_instance()"): PiiAnalyzer() - def test_analyze_no_pii(self, analyzer, mock_analyzer_engine): - text = "Hello world" - mock_analyzer_engine.analyze.return_value = [] - - result_text, found_pii, session_store = analyzer.analyze(text) - - assert result_text == text - assert found_pii == [] - assert isinstance(session_store, PiiSessionStore) - - def test_analyze_with_pii(self, analyzer, mock_analyzer_engine): - text = "My email is test@example.com" - email_pii = RecognizerResult( - entity_type="EMAIL_ADDRESS", - start=12, - end=28, - score=1.0, # EmailRecognizer returns a score of 1.0 - ) - mock_analyzer_engine.analyze.return_value = [email_pii] - - result_text, found_pii, session_store = analyzer.analyze(text) - - assert len(found_pii) == 1 - pii_info = found_pii[0] - assert pii_info["type"] == "EMAIL_ADDRESS" - assert pii_info["value"] == "test@example.com" - assert pii_info["score"] == 1.0 - assert pii_info["start"] == 12 - assert pii_info["end"] == 28 - assert "uuid_placeholder" in pii_info - # 
Verify the placeholder was used to replace the PII - placeholder = pii_info["uuid_placeholder"] - assert result_text == f"My email is {placeholder}" - # Verify the mapping was stored - assert session_store.get_pii(placeholder) == "test@example.com" - def test_restore_pii(self, analyzer): - session_store = PiiSessionStore() original_text = "test@example.com" - placeholder = session_store.add_mapping(original_text) - anonymized_text = f"My email is {placeholder}" + session_id = "session-id" - restored_text = analyzer.restore_pii(anonymized_text, session_store) + placeholder = analyzer.session_store.add_mapping(session_id, original_text) + anonymized_text = f"My email is {placeholder}" + restored_text = analyzer.restore_pii(session_id, anonymized_text) assert restored_text == f"My email is {original_text}" def test_restore_pii_multiple(self, analyzer): - session_store = PiiSessionStore() email = "test@example.com" phone = "123-456-7890" - email_placeholder = session_store.add_mapping(email) - phone_placeholder = session_store.add_mapping(phone) + session_id = "session-id" + email_placeholder = analyzer.session_store.add_mapping(session_id, email) + phone_placeholder = analyzer.session_store.add_mapping(session_id, phone) anonymized_text = f"Email: {email_placeholder}, Phone: {phone_placeholder}" - restored_text = analyzer.restore_pii(anonymized_text, session_store) + restored_text = analyzer.restore_pii(session_id, anonymized_text) assert restored_text == f"Email: {email}, Phone: {phone}" def test_restore_pii_no_placeholders(self, analyzer): - session_store = PiiSessionStore() text = "No PII here" - - restored_text = analyzer.restore_pii(text, session_store) + session_id = "session-id" + restored_text = analyzer.restore_pii(session_id, text) assert restored_text == text diff --git a/tests/pipeline/pii/test_pi.py b/tests/pipeline/pii/test_pi.py index 6578a7b6..06d2881f 100644 --- a/tests/pipeline/pii/test_pi.py +++ b/tests/pipeline/pii/test_pi.py @@ -4,9 +4,10 @@ from litellm import ChatCompletionRequest, ModelResponse from litellm.types.utils import Delta, StreamingChoices -from codegate.pipeline.base import PipelineContext +from codegate.pipeline.base import PipelineContext, PipelineSensitiveData from codegate.pipeline.output import OutputPipelineContext from codegate.pipeline.pii.pii import CodegatePii, PiiRedactionNotifier, PiiUnRedactionStep +from codegate.pipeline.sensitive_data.manager import SensitiveDataManager class TestCodegatePii: @@ -19,8 +20,9 @@ def mock_config(self): yield mock_config @pytest.fixture - def pii_step(self, mock_config): - return CodegatePii() + def pii_step(self): + mock_sensitive_data_manager = MagicMock() + return CodegatePii(mock_sensitive_data_manager) def test_name(self, pii_step): assert pii_step.name == "codegate-pii" @@ -51,57 +53,6 @@ async def test_process_no_messages(self, pii_step): assert result.request == request assert result.context == context - @pytest.mark.asyncio - async def test_process_with_pii(self, pii_step): - original_text = "My email is test@example.com" - request = ChatCompletionRequest( - model="test-model", messages=[{"role": "user", "content": original_text}] - ) - context = PipelineContext() - - # Mock the PII manager's analyze method - placeholder = "" - pii_details = [ - { - "type": "EMAIL_ADDRESS", - "value": "test@example.com", - "score": 1.0, - "start": 12, - "end": 27, - "uuid_placeholder": placeholder, - } - ] - anonymized_text = f"My email is {placeholder}" - pii_step.pii_manager.analyze = 
MagicMock(return_value=(anonymized_text, pii_details)) - - result = await pii_step.process(request, context) - - # Verify the user message was anonymized - user_messages = [m for m in result.request["messages"] if m["role"] == "user"] - assert len(user_messages) == 1 - assert user_messages[0]["content"] == anonymized_text - - # Verify metadata was updated - assert result.context.metadata["redacted_pii_count"] == 1 - assert len(result.context.metadata["redacted_pii_details"]) == 1 - # The redacted text should be just the placeholder since that's what _get_redacted_snippet returns # noqa: E501 - assert result.context.metadata["redacted_text"] == placeholder - assert "pii_manager" in result.context.metadata - - # Verify system message was added - system_messages = [m for m in result.request["messages"] if m["role"] == "system"] - assert len(system_messages) == 1 - assert system_messages[0]["content"] == "PII has been redacted" - - def test_restore_pii(self, pii_step): - anonymized_text = "My email is " - original_text = "My email is test@example.com" - pii_step.pii_manager.restore_pii = MagicMock(return_value=original_text) - - restored = pii_step.restore_pii(anonymized_text) - - assert restored == original_text - class TestPiiUnRedactionStep: @pytest.fixture @@ -148,7 +99,7 @@ async def test_process_chunk_with_uuid(self, unredaction_step): StreamingChoices( finish_reason=None, index=0, - delta=Delta(content=f"Text with <{uuid}>"), + delta=Delta(content=f"Text with #{uuid}#"), logprobs=None, ) ], @@ -157,17 +108,16 @@ async def test_process_chunk_with_uuid(self, unredaction_step): object="chat.completion.chunk", ) context = OutputPipelineContext() - input_context = PipelineContext() + manager = SensitiveDataManager() + sensitive = PipelineSensitiveData(manager=manager, session_id="session-id") + input_context = PipelineContext(sensitive=sensitive) # Mock PII manager in input context - mock_pii_manager = MagicMock() - mock_session = MagicMock() - mock_session.get_pii = MagicMock(return_value="test@example.com") - mock_pii_manager.session_store = mock_session - input_context.metadata["pii_manager"] = mock_pii_manager + mock_sensitive_data_manager = MagicMock() + mock_sensitive_data_manager.get_original_value = MagicMock(return_value="test@example.com") + input_context.metadata["sensitive_data_manager"] = mock_sensitive_data_manager result = await unredaction_step.process_chunk(chunk, context, input_context) - assert result[0].choices[0].delta.content == "Text with test@example.com" diff --git a/tests/pipeline/pii/test_pii_manager.py b/tests/pipeline/pii/test_pii_manager.py deleted file mode 100644 index 229b7314..00000000 --- a/tests/pipeline/pii/test_pii_manager.py +++ /dev/null @@ -1,106 +0,0 @@ -from unittest.mock import MagicMock, patch - -import pytest - -from codegate.pipeline.pii.analyzer import PiiSessionStore -from codegate.pipeline.pii.manager import PiiManager - - -class TestPiiManager: - @pytest.fixture - def session_store(self): - """Create a session store that will be shared between the mock and manager""" - return PiiSessionStore() - - @pytest.fixture - def mock_analyzer(self, session_store): - """Create a mock analyzer with the shared session store""" - mock_instance = MagicMock() - mock_instance.analyze = MagicMock() - mock_instance.restore_pii = MagicMock() - mock_instance.session_store = session_store - return mock_instance - - @pytest.fixture - def manager(self, mock_analyzer): - """Create a PiiManager instance with the mocked analyzer""" - with 
patch("codegate.pipeline.pii.manager.PiiAnalyzer") as mock_analyzer_class: - # Set up the mock class to return our mock instance - mock_analyzer_class.get_instance.return_value = mock_analyzer - # Create the manager which will use our mock - return PiiManager() - - def test_init(self, manager, mock_analyzer): - assert manager.session_store is mock_analyzer.session_store - assert manager.analyzer is mock_analyzer - - def test_analyze_no_pii(self, manager, mock_analyzer): - text = "Hello CodeGate" - session_store = mock_analyzer.session_store - mock_analyzer.analyze.return_value = (text, [], session_store) - - anonymized_text, found_pii = manager.analyze(text) - - assert anonymized_text == text - assert found_pii == [] - assert manager.session_store is session_store - mock_analyzer.analyze.assert_called_once_with(text, context=None) - - def test_analyze_with_pii(self, manager, mock_analyzer): - text = "My email is test@example.com" - session_store = mock_analyzer.session_store - placeholder = "" - pii_details = [ - { - "type": "EMAIL_ADDRESS", - "value": "test@example.com", - "score": 0.85, - "start": 12, - "end": 28, # Fixed end position - "uuid_placeholder": placeholder, - } - ] - anonymized_text = f"My email is {placeholder}" - session_store.mappings[placeholder] = "test@example.com" - mock_analyzer.analyze.return_value = (anonymized_text, pii_details, session_store) - - result_text, found_pii = manager.analyze(text) - - assert "My email is <" in result_text - assert ">" in result_text - assert found_pii == pii_details - assert manager.session_store is session_store - assert manager.session_store.mappings[placeholder] == "test@example.com" - mock_analyzer.analyze.assert_called_once_with(text, context=None) - - def test_restore_pii_no_session(self, manager, mock_analyzer): - text = "Anonymized text" - # Create a new session store that's None - mock_analyzer.session_store = None - - restored_text = manager.restore_pii(text) - - assert restored_text == text - - def test_restore_pii_with_session(self, manager, mock_analyzer): - anonymized_text = "My email is " - original_text = "My email is test@example.com" - manager.session_store.mappings[""] = "test@example.com" - mock_analyzer.restore_pii.return_value = original_text - - restored_text = manager.restore_pii(anonymized_text) - - assert restored_text == original_text - mock_analyzer.restore_pii.assert_called_once_with(anonymized_text, manager.session_store) - - def test_restore_pii_multiple_placeholders(self, manager, mock_analyzer): - anonymized_text = "Email: , Phone: " - original_text = "Email: test@example.com, Phone: 123-456-7890" - manager.session_store.mappings[""] = "test@example.com" - manager.session_store.mappings[""] = "123-456-7890" - mock_analyzer.restore_pii.return_value = original_text - - restored_text = manager.restore_pii(anonymized_text) - - assert restored_text == original_text - mock_analyzer.restore_pii.assert_called_once_with(anonymized_text, manager.session_store) diff --git a/tests/pipeline/secrets/test_gatecrypto.py b/tests/pipeline/secrets/test_gatecrypto.py deleted file mode 100644 index b7de4b19..00000000 --- a/tests/pipeline/secrets/test_gatecrypto.py +++ /dev/null @@ -1,157 +0,0 @@ -import time - -import pytest - -from codegate.pipeline.secrets.gatecrypto import CodeGateCrypto - - -@pytest.fixture -def crypto(): - return CodeGateCrypto() - - -def test_generate_session_key(crypto): - session_id = "test_session" - key = crypto.generate_session_key(session_id) - - assert len(key) == 32 # AES-256 key size - assert 
session_id in crypto.session_keys - assert isinstance(crypto.session_keys[session_id], tuple) - assert len(crypto.session_keys[session_id]) == 2 - - -def test_get_session_key(crypto): - session_id = "test_session" - original_key = crypto.generate_session_key(session_id) - retrieved_key = crypto.get_session_key(session_id) - - assert original_key == retrieved_key - - -def test_get_expired_session_key(crypto): - session_id = "test_session" - crypto.generate_session_key(session_id) - - # Manually expire the key by modifying its timestamp - key, _ = crypto.session_keys[session_id] - crypto.session_keys[session_id] = (key, time.time() - (crypto.SESSION_KEY_LIFETIME + 10)) - - retrieved_key = crypto.get_session_key(session_id) - assert retrieved_key is None - assert session_id not in crypto.session_keys - - -def test_cleanup_expired_keys(crypto): - # Generate multiple session keys - session_ids = ["session1", "session2", "session3"] - for session_id in session_ids: - crypto.generate_session_key(session_id) - - # Manually expire some keys - key, _ = crypto.session_keys["session1"] - crypto.session_keys["session1"] = (key, time.time() - (crypto.SESSION_KEY_LIFETIME + 10)) - key, _ = crypto.session_keys["session2"] - crypto.session_keys["session2"] = (key, time.time() - (crypto.SESSION_KEY_LIFETIME + 10)) - - crypto.cleanup_expired_keys() - - assert "session1" not in crypto.session_keys - assert "session2" not in crypto.session_keys - assert "session3" in crypto.session_keys - - -def test_encrypt_decrypt_token(crypto): - session_id = "test_session" - original_token = "sensitive_data_123" - - encrypted_token = crypto.encrypt_token(original_token, session_id) - decrypted_token = crypto.decrypt_token(encrypted_token, session_id) - - assert decrypted_token == original_token - - -def test_decrypt_with_expired_session(crypto): - session_id = "test_session" - token = "sensitive_data_123" - - encrypted_token = crypto.encrypt_token(token, session_id) - - # Manually expire the session key - key, _ = crypto.session_keys[session_id] - crypto.session_keys[session_id] = (key, time.time() - (crypto.SESSION_KEY_LIFETIME + 10)) - - with pytest.raises(ValueError, match="Session key expired or invalid."): - crypto.decrypt_token(encrypted_token, session_id) - - -def test_decrypt_with_invalid_session(crypto): - session_id = "test_session" - token = "sensitive_data_123" - - encrypted_token = crypto.encrypt_token(token, session_id) - - with pytest.raises(ValueError, match="Session key expired or invalid."): - crypto.decrypt_token(encrypted_token, "invalid_session") - - -def test_decrypt_with_expired_token(crypto, monkeypatch): - session_id = "test_session" - token = "sensitive_data_123" - current_time = time.time() - - # Mock time.time() for token encryption - monkeypatch.setattr(time, "time", lambda: current_time) - - # Generate token with current timestamp - encrypted_token = crypto.encrypt_token(token, session_id) - - # Mock time.time() to return a future timestamp for decryption - future_time = current_time + crypto.SESSION_KEY_LIFETIME + 10 - monkeypatch.setattr(time, "time", lambda: future_time) - - # Keep the original key but update its timestamp to keep it valid - key, _ = crypto.session_keys[session_id] - crypto.session_keys[session_id] = (key, future_time) - - with pytest.raises(ValueError, match="Token has expired."): - crypto.decrypt_token(encrypted_token, session_id) - - -def test_wipe_bytearray(crypto): - # Create a bytearray with sensitive data - sensitive_data = bytearray(b"sensitive_information") - 
original_content = sensitive_data.copy() - - # Wipe the data - crypto.wipe_bytearray(sensitive_data) - - # Verify all bytes are zeroed - assert all(byte == 0 for byte in sensitive_data) - assert sensitive_data != original_content - - -def test_wipe_bytearray_invalid_input(crypto): - # Try to wipe a string instead of bytearray - with pytest.raises(ValueError, match="Only bytearray objects can be securely wiped."): - crypto.wipe_bytearray("not a bytearray") - - -def test_encrypt_decrypt_with_special_characters(crypto): - session_id = "test_session" - special_chars_token = "!@#$%^&*()_+-=[]{}|;:,.<>?" - - encrypted_token = crypto.encrypt_token(special_chars_token, session_id) - decrypted_token = crypto.decrypt_token(encrypted_token, session_id) - - assert decrypted_token == special_chars_token - - -def test_encrypt_decrypt_multiple_tokens(crypto): - session_id = "test_session" - tokens = ["token1", "token2", "token3"] - - # Encrypt and immediately decrypt each token - for token in tokens: - encrypted = crypto.encrypt_token(token, session_id) - decrypted = crypto.decrypt_token(encrypted, session_id) - assert decrypted == token diff --git a/tests/pipeline/secrets/test_manager.py b/tests/pipeline/secrets/test_manager.py deleted file mode 100644 index 177e8f3f..00000000 --- a/tests/pipeline/secrets/test_manager.py +++ /dev/null @@ -1,149 +0,0 @@ -import pytest - -from codegate.pipeline.secrets.manager import SecretsManager - - -class TestSecretsManager: - def setup_method(self): - """Setup a fresh SecretsManager for each test""" - self.manager = SecretsManager() - self.test_session = "test_session_id" - self.test_value = "super_secret_value" - self.test_service = "test_service" - self.test_type = "api_key" - - def test_store_secret(self): - """Test basic secret storage and retrieval""" - # Store a secret - encrypted = self.manager.store_secret( - self.test_value, self.test_service, self.test_type, self.test_session - ) - - # Verify the secret was stored - stored = self.manager.get_by_session_id(self.test_session) - assert isinstance(stored, dict) - assert stored[encrypted].original == self.test_value - - # Verify encrypted value can be retrieved - retrieved = self.manager.get_original_value(encrypted, self.test_session) - assert retrieved == self.test_value - - def test_get_original_value_wrong_session(self): - """Test that secrets can't be accessed with wrong session ID""" - encrypted = self.manager.store_secret( - self.test_value, self.test_service, self.test_type, self.test_session - ) - - # Try to retrieve with wrong session ID - wrong_session = "wrong_session_id" - retrieved = self.manager.get_original_value(encrypted, wrong_session) - assert retrieved is None - - def test_get_original_value_nonexistent(self): - """Test handling of non-existent encrypted values""" - retrieved = self.manager.get_original_value("nonexistent", self.test_session) - assert retrieved is None - - def test_cleanup_session(self): - """Test that session cleanup properly removes secrets""" - # Store multiple secrets in different sessions - session1 = "session1" - session2 = "session2" - - encrypted1 = self.manager.store_secret("secret1", "service1", "type1", session1) - encrypted2 = self.manager.store_secret("secret2", "service2", "type2", session2) - - # Clean up session1 - self.manager.cleanup_session(session1) - - # Verify session1 secrets are gone - assert self.manager.get_by_session_id(session1) is None - assert self.manager.get_original_value(encrypted1, session1) is None - - # Verify session2 secrets remain - 
assert self.manager.get_by_session_id(session2) is not None - assert self.manager.get_original_value(encrypted2, session2) == "secret2" - - def test_cleanup(self): - """Test that cleanup properly wipes all data""" - # Store multiple secrets - self.manager.store_secret("secret1", "service1", "type1", "session1") - self.manager.store_secret("secret2", "service2", "type2", "session2") - - # Perform cleanup - self.manager.cleanup() - - # Verify all data is wiped - assert len(self.manager._session_store) == 0 - assert len(self.manager._encrypted_to_session) == 0 - - def test_multiple_secrets_same_session(self): - """Test storing multiple secrets in the same session""" - # Store multiple secrets in same session - encrypted1 = self.manager.store_secret("secret1", "service1", "type1", self.test_session) - encrypted2 = self.manager.store_secret("secret2", "service2", "type2", self.test_session) - - # Latest secret should be retrievable in the session - stored = self.manager.get_by_session_id(self.test_session) - assert isinstance(stored, dict) - assert stored[encrypted1].original == "secret1" - assert stored[encrypted2].original == "secret2" - - # Both secrets should be retrievable directly - assert self.manager.get_original_value(encrypted1, self.test_session) == "secret1" - assert self.manager.get_original_value(encrypted2, self.test_session) == "secret2" - - # Both encrypted values should map to the session - assert self.manager._encrypted_to_session[encrypted1] == self.test_session - assert self.manager._encrypted_to_session[encrypted2] == self.test_session - - def test_error_handling(self): - """Test error handling in secret operations""" - # Test with None values - with pytest.raises(ValueError): - self.manager.store_secret(None, self.test_service, self.test_type, self.test_session) - - with pytest.raises(ValueError): - self.manager.store_secret(self.test_value, None, self.test_type, self.test_session) - - with pytest.raises(ValueError): - self.manager.store_secret(self.test_value, self.test_service, None, self.test_session) - - with pytest.raises(ValueError): - self.manager.store_secret(self.test_value, self.test_service, self.test_type, None) - - def test_secure_cleanup(self): - """Test that cleanup securely wipes sensitive data""" - # Store a secret - self.manager.store_secret( - self.test_value, self.test_service, self.test_type, self.test_session - ) - - # Get reference to stored data before cleanup - stored = self.manager.get_by_session_id(self.test_session) - assert len(stored) == 1 - - # Perform cleanup - self.manager.cleanup() - - # Verify the original string was overwritten, not just removed - # This test is a bit tricky since Python strings are immutable, - # but we can at least verify the data is no longer accessible - assert self.test_value not in str(self.manager._session_store) - - def test_session_isolation(self): - """Test that sessions are properly isolated""" - session1 = "session1" - session2 = "session2" - - # Store secrets in different sessions - encrypted1 = self.manager.store_secret("secret1", "service1", "type1", session1) - encrypted2 = self.manager.store_secret("secret2", "service2", "type2", session2) - - # Verify cross-session access is not possible - assert self.manager.get_original_value(encrypted1, session2) is None - assert self.manager.get_original_value(encrypted2, session1) is None - - # Verify correct session access works - assert self.manager.get_original_value(encrypted1, session1) == "secret1" - assert self.manager.get_original_value(encrypted2, 
diff --git a/tests/pipeline/secrets/test_secrets.py b/tests/pipeline/secrets/test_secrets.py
index 759b94b0..3f272b5b 100644
--- a/tests/pipeline/secrets/test_secrets.py
+++ b/tests/pipeline/secrets/test_secrets.py
@@ -7,13 +7,13 @@
 from codegate.pipeline.base import PipelineContext, PipelineSensitiveData
 from codegate.pipeline.output import OutputPipelineContext
-from codegate.pipeline.secrets.manager import SecretsManager
 from codegate.pipeline.secrets.secrets import (
     SecretsEncryptor,
     SecretsObfuscator,
     SecretUnredactionStep,
 )
 from codegate.pipeline.secrets.signatures import CodegateSignatures, Match
+from codegate.pipeline.sensitive_data.manager import SensitiveData, SensitiveDataManager


 class TestSecretsModifier:
@@ -69,9 +69,11 @@ class TestSecretsEncryptor:
     def setup(self, temp_yaml_file):
         CodegateSignatures.initialize(temp_yaml_file)
         self.context = PipelineContext()
-        self.secrets_manager = SecretsManager()
+        self.sensitive_data_manager = SensitiveDataManager()
         self.session_id = "test_session"
-        self.encryptor = SecretsEncryptor(self.secrets_manager, self.context, self.session_id)
+        self.encryptor = SecretsEncryptor(
+            self.sensitive_data_manager, self.context, self.session_id
+        )

     def test_hide_secret(self):
         # Create a test match
@@ -87,12 +89,12 @@ def test_hide_secret(self):
         # Test secret hiding
         hidden = self.encryptor._hide_secret(match)

-        assert hidden.startswith("REDACTED<$")
+        assert hidden.startswith("REDACTED<")
         assert hidden.endswith(">")

         # Verify the secret was stored
-        encrypted_value = hidden[len("REDACTED<$") : -1]
-        original = self.secrets_manager.get_original_value(encrypted_value, self.session_id)
+        encrypted_value = hidden[len("REDACTED<") : -1]
+        original = self.sensitive_data_manager.get_original_value(self.session_id, encrypted_value)
         assert original == "AKIAIOSFODNN7EXAMPLE"

     def test_obfuscate(self):
@@ -101,7 +103,7 @@ def test_obfuscate(self):
         protected, matched_secrets = self.encryptor.obfuscate(text, None)

         assert len(matched_secrets) == 1
-        assert "REDACTED<$" in protected
+        assert "REDACTED<" in protected
         assert "AKIAIOSFODNN7EXAMPLE" not in protected
         assert "Other text" in protected

@@ -171,25 +173,24 @@ def setup_method(self):
         """Setup fresh instances for each test"""
         self.step = SecretUnredactionStep()
         self.context = OutputPipelineContext()
-        self.secrets_manager = SecretsManager()
+        self.sensitive_data_manager = SensitiveDataManager()
         self.session_id = "test_session"

         # Setup input context with secrets manager
         self.input_context = PipelineContext()
         self.input_context.sensitive = PipelineSensitiveData(
-            manager=self.secrets_manager, session_id=self.session_id
+            manager=self.sensitive_data_manager, session_id=self.session_id
         )

     @pytest.mark.asyncio
     async def test_complete_marker_processing(self):
         """Test processing of a complete REDACTED marker"""
         # Store a secret
-        encrypted = self.secrets_manager.store_secret(
-            "secret_value", "test_service", "api_key", self.session_id
-        )
+        obj = SensitiveData(original="secret_value", service="test_service", type="api_key")
+        encrypted = self.sensitive_data_manager.store(self.session_id, obj)

         # Add content with REDACTED marker to buffer
-        self.context.buffer.append(f"Here is the REDACTED<${encrypted}> in text")
+        self.context.buffer.append(f"Here is the REDACTED<{encrypted}> in text")

         # Process a chunk
         result = await self.step.process_chunk(
@@ -204,7 +205,7 @@ async def test_partial_marker_buffering(self):
         """Test handling of partial REDACTED markers"""
         # Add partial marker to buffer
-        self.context.buffer.append("Here is REDACTED<$")
+        self.context.buffer.append("Here is REDACTED<")

         # Process a chunk
         result = await self.step.process_chunk(
@@ -218,7 +219,7 @@ async def test_invalid_encrypted_value(self):
         """Test handling of invalid encrypted values"""
         # Add content with invalid encrypted value
-        self.context.buffer.append("Here is REDACTED<$invalid_value> in text")
+        self.context.buffer.append("Here is REDACTED<invalid_value> in text")

         # Process chunk
         result = await self.step.process_chunk(
@@ -227,7 +228,7 @@ async def test_invalid_encrypted_value(self):

         # Should keep the REDACTED marker for invalid values
         assert len(result) == 1
-        assert result[0].choices[0].delta.content == "Here is REDACTED<$invalid_value> in text"
+        assert result[0].choices[0].delta.content == "Here is REDACTED<invalid_value> in text"

     @pytest.mark.asyncio
     async def test_missing_context(self):
@@ -271,12 +272,11 @@ async def test_no_markers(self):
     async def test_wrong_session(self):
         """Test unredaction with wrong session ID"""
         # Store secret with one session
-        encrypted = self.secrets_manager.store_secret(
-            "secret_value", "test_service", "api_key", "different_session"
-        )
+        obj = SensitiveData(original="secret_value", service="test_service", type="api_key")
+        encrypted = self.sensitive_data_manager.store("different_session", obj)

         # Try to unredact with different session
-        self.context.buffer.append(f"Here is the REDACTED<${encrypted}> in text")
+        self.context.buffer.append(f"Here is the REDACTED<{encrypted}> in text")

         result = await self.step.process_chunk(
             create_model_response("text"), self.context, self.input_context
@@ -284,4 +284,4 @@ async def test_wrong_session(self):

         # Should keep REDACTED marker when session doesn't match
         assert len(result) == 1
-        assert result[0].choices[0].delta.content == f"Here is the REDACTED<${encrypted}> in text"
+        assert result[0].choices[0].delta.content == f"Here is the REDACTED<{encrypted}> in text"
diff --git a/tests/pipeline/sensitive_data/test_manager.py b/tests/pipeline/sensitive_data/test_manager.py
new file mode 100644
index 00000000..6115ad14
--- /dev/null
+++ b/tests/pipeline/sensitive_data/test_manager.py
@@ -0,0 +1,48 @@
+import json
+from unittest.mock import MagicMock, patch
+import pytest
+from codegate.pipeline.sensitive_data.manager import SensitiveData, SensitiveDataManager
+from codegate.pipeline.sensitive_data.session_store import SessionStore
+
+
+class TestSensitiveDataManager:
+    @pytest.fixture
+    def mock_session_store(self):
+        """Mock the SessionStore instance used within SensitiveDataManager."""
+        return MagicMock(spec=SessionStore)
+
+    @pytest.fixture
+    def manager(self, mock_session_store):
+        """Patch SensitiveDataManager to use the mocked SessionStore."""
+        with patch.object(SensitiveDataManager, "__init__", lambda self: None):
+            manager = SensitiveDataManager()
+            manager.session_store = mock_session_store  # Manually inject the mock
+            return manager
+
+    def test_store_success(self, manager, mock_session_store):
+        """Test storing a SensitiveData object successfully."""
+        session_id = "session-123"
+        sensitive_data = SensitiveData(original="secret_value", service="AWS", type="API_KEY")
+
+        # Mock session store behavior
+        mock_session_store.add_mapping.return_value = "uuid-123"
+
+        result = manager.store(session_id, sensitive_data)
+
+        # Verify correct function calls
+        mock_session_store.add_mapping.assert_called_once_with(
+            session_id, sensitive_data.model_dump_json()
+        )
+        assert result == "uuid-123"
+
+    def test_store_invalid_session_id(self, manager):
+        """Test storing data with an invalid session ID (should return None)."""
+        sensitive_data = SensitiveData(original="secret_value", service="AWS", type="API_KEY")
+        result = manager.store("", sensitive_data)  # Empty session ID
+        assert result is None
+
+    def test_store_missing_original_value(self, manager):
+        """Test storing data without an original value (should return None)."""
+        sensitive_data = SensitiveData(original="", service="AWS", type="API_KEY")  # Empty original
+        result = manager.store("session-123", sensitive_data)
+        assert result is None
diff --git a/tests/pipeline/sensitive_data/test_session_store.py b/tests/pipeline/sensitive_data/test_session_store.py
new file mode 100644
index 00000000..b9ab64fe
--- /dev/null
+++ b/tests/pipeline/sensitive_data/test_session_store.py
@@ -0,0 +1,114 @@
+import uuid
+import pytest
+from codegate.pipeline.sensitive_data.session_store import SessionStore
+
+
+class TestSessionStore:
+    @pytest.fixture
+    def session_store(self):
+        """Fixture to create a fresh SessionStore instance before each test."""
+        return SessionStore()
+
+    def test_add_mapping_creates_uuid(self, session_store):
+        """Test that add_mapping correctly stores data and returns a UUID."""
+        session_id = "session-123"
+        data = "test-data"
+
+        uuid_placeholder = session_store.add_mapping(session_id, data)
+
+        # Ensure the returned placeholder follows the expected format
+        assert uuid_placeholder.startswith("#") and uuid_placeholder.endswith("#")
+        assert len(uuid_placeholder) > 2  # Should have a UUID inside
+
+        # Verify data is correctly stored
+        stored_data = session_store.get_mapping(session_id, uuid_placeholder)
+        assert stored_data == data
+
+    def test_add_mapping_creates_unique_uuids(self, session_store):
+        """Ensure multiple calls to add_mapping generate unique UUIDs."""
+        session_id = "session-123"
+        data1 = "data1"
+        data2 = "data2"
+
+        uuid_placeholder1 = session_store.add_mapping(session_id, data1)
+        uuid_placeholder2 = session_store.add_mapping(session_id, data2)
+
+        assert uuid_placeholder1 != uuid_placeholder2  # UUIDs must be unique
+
+        # Ensure data is correctly stored
+        assert session_store.get_mapping(session_id, uuid_placeholder1) == data1
+        assert session_store.get_mapping(session_id, uuid_placeholder2) == data2
+
+    def test_get_by_session_id(self, session_store):
+        """Test retrieving all stored mappings for a session ID."""
+        session_id = "session-123"
+        data1 = "data1"
+        data2 = "data2"
+
+        uuid1 = session_store.add_mapping(session_id, data1)
+        uuid2 = session_store.add_mapping(session_id, data2)
+
+        stored_session_data = session_store.get_by_session_id(session_id)
+
+        assert uuid1 in stored_session_data
+        assert uuid2 in stored_session_data
+        assert stored_session_data[uuid1] == data1
+        assert stored_session_data[uuid2] == data2
+
+    def test_get_by_session_id_not_found(self, session_store):
+        """Test get_by_session_id when session does not exist (should return None)."""
+        session_id = "non-existent-session"
+        assert session_store.get_by_session_id(session_id) is None
+
+    def test_get_mapping_success(self, session_store):
+        """Test retrieving a specific mapping."""
+        session_id = "session-123"
+        data = "test-data"
+
+        uuid_placeholder = session_store.add_mapping(session_id, data)
+
+        assert session_store.get_mapping(session_id, uuid_placeholder) == data
+
+    def test_get_mapping_not_found(self, session_store):
+        """Test retrieving a mapping that does not exist (should return None)."""
+        session_id = "session-123"
+        uuid_placeholder = "#non-existent-uuid#"
+
+        assert session_store.get_mapping(session_id, uuid_placeholder) is None
+
+    def test_cleanup_session(self, session_store):
+        """Test that cleanup_session removes all data for a session ID."""
+        session_id = "session-123"
+        session_store.add_mapping(session_id, "test-data")
+
+        # Ensure session exists before cleanup
+        assert session_store.get_by_session_id(session_id) is not None
+
+        session_store.cleanup_session(session_id)
+
+        # Ensure session is removed after cleanup
+        assert session_store.get_by_session_id(session_id) is None
+
+    def test_cleanup_session_non_existent(self, session_store):
+        """Test cleanup_session on a non-existent session (should not raise errors)."""
+        session_id = "non-existent-session"
+        session_store.cleanup_session(session_id)  # Should not fail
+        assert session_store.get_by_session_id(session_id) is None
+
+    def test_cleanup(self, session_store):
+        """Test global cleanup removes all stored sessions."""
+        session_id1 = "session-1"
+        session_id2 = "session-2"
+
+        session_store.add_mapping(session_id1, "data1")
+        session_store.add_mapping(session_id2, "data2")
+
+        # Ensure sessions exist before cleanup
+        assert session_store.get_by_session_id(session_id1) is not None
+        assert session_store.get_by_session_id(session_id2) is not None
+
+        session_store.cleanup()
+
+        # Ensure all sessions are removed after cleanup
+        assert session_store.get_by_session_id(session_id1) is None
+        assert session_store.get_by_session_id(session_id2) is None
diff --git a/tests/test_server.py b/tests/test_server.py
index 1e06c096..aa549810 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -14,19 +14,13 @@
 from codegate import __version__
 from codegate.pipeline.factory import PipelineFactory
-from codegate.pipeline.secrets.manager import SecretsManager
+from codegate.pipeline.sensitive_data.manager import SensitiveDataManager
 from codegate.providers.registry import ProviderRegistry
 from codegate.server import init_app
 from src.codegate.cli import UvicornServer, cli
 from src.codegate.codegate_logging import LogFormat, LogLevel


-@pytest.fixture
-def mock_secrets_manager():
-    """Create a mock secrets manager."""
-    return MagicMock(spec=SecretsManager)
-
-
 @pytest.fixture
 def mock_provider_registry():
     """Create a mock provider registry."""
@@ -96,9 +90,9 @@ def test_version_endpoint(mock_fetch_latest_version, test_client: TestClient) -> None:
     assert response_data["is_latest"] is False


-@patch("codegate.pipeline.secrets.manager.SecretsManager")
+@patch("codegate.pipeline.sensitive_data.manager.SensitiveDataManager")
 @patch("codegate.server.get_provider_registry")
-def test_provider_registration(mock_registry, mock_secrets_mgr, mock_pipeline_factory) -> None:
+def test_provider_registration(mock_registry, mock_sensitive_data_mgr, mock_pipeline_factory) -> None:
     """Test that all providers are registered correctly."""
     init_app(mock_pipeline_factory)