From 5ab1ad817e893dfb704345d586c1cac73aca65f6 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 9 Oct 2024 16:34:07 +0200 Subject: [PATCH 1/2] Add Python bindings --- Cargo.toml | 1 + crates/bpe/bindings/python/.gitignore | 72 +++++++++++++++++++++++ crates/bpe/bindings/python/Cargo.toml | 13 ++++ crates/bpe/bindings/python/pyproject.toml | 21 +++++++ crates/bpe/bindings/python/src/lib.rs | 50 ++++++++++++++++ crates/bpe/bindings/python/test.py | 12 ++++ crates/bpe/src/byte_pair_encoding.rs | 2 +- 7 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 crates/bpe/bindings/python/.gitignore create mode 100644 crates/bpe/bindings/python/Cargo.toml create mode 100644 crates/bpe/bindings/python/pyproject.toml create mode 100644 crates/bpe/bindings/python/src/lib.rs create mode 100755 crates/bpe/bindings/python/test.py diff --git a/Cargo.toml b/Cargo.toml index c91a813..22c20d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "crates/*", + "crates/bpe/bindings/python", ] resolver = "2" diff --git a/crates/bpe/bindings/python/.gitignore b/crates/bpe/bindings/python/.gitignore new file mode 100644 index 0000000..c8f0442 --- /dev/null +++ b/crates/bpe/bindings/python/.gitignore @@ -0,0 +1,72 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv +.python-version diff --git a/crates/bpe/bindings/python/Cargo.toml b/crates/bpe/bindings/python/Cargo.toml new file mode 100644 index 0000000..3b94b6d --- /dev/null +++ b/crates/bpe/bindings/python/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "python-bpe" +version = "0.1.0" +edition = "2021" + +[lib] +name = "bpe" +crate-type = ["cdylib"] + +[dependencies] +bpe = { version = "0.1", path = "../../../bpe" } +bpe-openai = { version = "0.1", path = "../../../bpe-openai" } +pyo3 = "0.22.3" diff --git a/crates/bpe/bindings/python/pyproject.toml b/crates/bpe/bindings/python/pyproject.toml new file mode 100644 index 0000000..52525a0 --- /dev/null +++ b/crates/bpe/bindings/python/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["maturin>=1.7,<2.0"] +build-backend = "maturin" + +[project] +name = "bpe" +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dynamic = ["version"] + +[tool.maturin] +features = ["pyo3/extension-module"] diff --git a/crates/bpe/bindings/python/src/lib.rs b/crates/bpe/bindings/python/src/lib.rs new file mode 100644 index 0000000..a3139a2 --- /dev/null +++ b/crates/bpe/bindings/python/src/lib.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; + +use pyo3::prelude::*; + +#[pyclass] +struct BytePairEncoding(Cow<'static, ::bpe::byte_pair_encoding::BytePairEncoding>); + +#[pymethods] +impl BytePairEncoding { + fn count(&self, input: Cow<[u8]>) -> usize { + self.0.count(&input) + } + + fn encode_via_backtracking(&self, input: Cow<[u8]>) -> Vec { + self.0.encode_via_backtracking(&input) + } + + fn decode_tokens(&self, tokens: Vec) -> Cow<[u8]> { + Cow::Owned(self.0.decode_tokens(&tokens)) + } +} + +#[pyfunction] +fn r50k() -> PyResult { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::r50k()))) +} + +#[pyfunction] +fn p50k() -> PyResult { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::p50k()))) +} + +#[pyfunction] +fn cl100k() -> PyResult { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::cl100k()))) +} + +#[pyfunction] +fn o200k() -> PyResult { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::o200k()))) +} + +#[pymodule] +fn bpe(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(r50k, m)?)?; + m.add_function(wrap_pyfunction!(p50k, m)?)?; + m.add_function(wrap_pyfunction!(cl100k, m)?)?; + m.add_function(wrap_pyfunction!(o200k, m)?)?; + Ok(()) +} diff --git a/crates/bpe/bindings/python/test.py b/crates/bpe/bindings/python/test.py new file mode 100755 index 0000000..f07cc83 --- /dev/null +++ b/crates/bpe/bindings/python/test.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +import bpe + +cl100k = bpe.cl100k() + +enc = cl100k.encode_via_backtracking("Hello, world!".encode()) +print(enc) +cnt = cl100k.count("Hello, world!".encode()) +print(cnt) +dec = cl100k.decode_tokens(enc).decode() +print(dec) diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs index f18468e..15cf0ef 100644 --- a/crates/bpe/src/byte_pair_encoding.rs +++ b/crates/bpe/src/byte_pair_encoding.rs @@ -35,7 +35,7 @@ pub(crate) static BPE_O200K: std::sync::LazyLock = /// Representation of the byte pair dictionary. /// This struct provides various conversions. /// We put all of them into a single struct so that they can be reused by different implementations. -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct BytePairEncoding { /// All the decoded tokens concatenated into all_tokens: Vec, From 5bc43ef8a89765cc75ec55e344c76935f97e7151 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 26 Feb 2025 15:54:53 +0100 Subject: [PATCH 2/2] Tweak API --- crates/bpe/bindings/python/pyproject.toml | 6 ++ crates/bpe/bindings/python/src/lib.rs | 93 ++++++++++++----------- crates/bpe/bindings/python/test.py | 8 +- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/crates/bpe/bindings/python/pyproject.toml b/crates/bpe/bindings/python/pyproject.toml index 38c0532..0709030 100644 --- a/crates/bpe/bindings/python/pyproject.toml +++ b/crates/bpe/bindings/python/pyproject.toml @@ -19,3 +19,9 @@ dynamic = ["version"] [tool.maturin] features = ["pyo3/extension-module"] + +[dependency-groups] +dev = [ + "maturin>=1.8.2", + "pip>=25.0.1", +] diff --git a/crates/bpe/bindings/python/src/lib.rs b/crates/bpe/bindings/python/src/lib.rs index 6a55d3f..6577606 100644 --- a/crates/bpe/bindings/python/src/lib.rs +++ b/crates/bpe/bindings/python/src/lib.rs @@ -2,59 +2,66 @@ use std::borrow::Cow; use pyo3::prelude::*; -#[pyclass] -struct BytePairEncoding(Cow<'static, ::bpe::byte_pair_encoding::BytePairEncoding>); +#[pymodule] +mod bpe { + use super::*; -#[pyclass] -struct Tokenizer(Cow<'static, ::bpe_openai::Tokenizer>); + #[pyclass] + struct BytePairEncoding(&'static ::bpe::byte_pair_encoding::BytePairEncoding); -#[pymethods] -impl BytePairEncoding { - fn count(&self, input: &[u8]) -> usize { - self.0.count(input) - } + #[pymethods] + impl BytePairEncoding { + fn count(&self, input: &[u8]) -> usize { + self.0.count(input) + } - fn encode_via_backtracking(&self, input: &[u8]) -> Vec { - self.0.encode_via_backtracking(input) - } + fn encode_via_backtracking(&self, input: &[u8]) -> Vec { + self.0.encode_via_backtracking(input) + } - fn decode_tokens(&self, tokens: Vec) -> Vec { - self.0.decode_tokens(&tokens) + fn decode_tokens(&self, tokens: Vec) -> Vec { + self.0.decode_tokens(&tokens) + } } -} -#[pymethods] -impl Tokenizer { - fn count(&self, input: &str) -> usize { - self.0.count(&input) - } + #[pymodule] + mod openai { + use super::*; - fn count_till_limit(&self, input: Cow, limit: usize) -> Option { - self.0.count_till_limit(&input, limit) - } + #[pyclass] + struct Tokenizer(&'static ::bpe_openai::Tokenizer); - fn encode(&self, input: Cow) -> Vec { - self.0.encode(&input) - } + #[pymethods] + impl Tokenizer { + fn count(&self, input: &str) -> usize { + self.0.count(&input) + } - fn decode(&self, tokens: Vec) -> Option { - self.0.decode(&tokens) - } -} + fn count_till_limit(&self, input: Cow, limit: usize) -> Option { + self.0.count_till_limit(&input, limit) + } -#[pyfunction] -fn cl100k_base() -> PyResult { - Ok(Tokenizer(Cow::Borrowed(::bpe_openai::cl100k_base()))) -} + fn encode(&self, input: Cow) -> Vec { + self.0.encode(&input) + } -#[pyfunction] -fn o200k_base() -> PyResult { - Ok(Tokenizer(Cow::Borrowed(::bpe_openai::o200k_base()))) -} + fn decode(&self, tokens: Vec) -> Option { + self.0.decode(&tokens) + } -#[pymodule] -fn bpe_openai(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_function(wrap_pyfunction!(cl100k_base, m)?)?; - m.add_function(wrap_pyfunction!(o200k_base, m)?)?; - Ok(()) + fn bpe(&self) -> BytePairEncoding { + BytePairEncoding(&self.0.bpe) + } + } + + #[pyfunction] + fn cl100k_base() -> PyResult { + Ok(Tokenizer(::bpe_openai::cl100k_base())) + } + + #[pyfunction] + fn o200k_base() -> PyResult { + Ok(Tokenizer(::bpe_openai::o200k_base())) + } + } } diff --git a/crates/bpe/bindings/python/test.py b/crates/bpe/bindings/python/test.py index 1cb4944..b9c36bf 100755 --- a/crates/bpe/bindings/python/test.py +++ b/crates/bpe/bindings/python/test.py @@ -2,7 +2,9 @@ import bpe -tok = bpe.cl100k_base() +tok = bpe.openai.cl100k_base() + +## Use tokenizer enc = tok.encode("Hello, world!") print(enc) @@ -10,3 +12,7 @@ print(cnt) dec = tok.decode(enc) print(dec) + +## Use underlying BPE instance + +bpe = tok.bpe()