Skip to content

Commit 68f04b9

Browse files
Merge remote-tracking branch 'origin/main' into python-bindings
2 parents 5ab1ad8 + 4b41c26 commit 68f04b9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+2399
-871
lines changed

.github/workflows/ci.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
steps:
2121
- uses: actions/checkout@v4
2222

23-
- uses: rui314/setup-mold@b015f7e3f2938ad3a5ed6e5111a8c6c7c1d6db6e
23+
- uses: rui314/setup-mold@f80524ca6eeaa76759b57fb78ddce5d87a20c720
2424

2525
- name: Build
2626
run: make build
@@ -32,7 +32,7 @@ jobs:
3232
steps:
3333
- uses: actions/checkout@v4
3434

35-
- uses: rui314/setup-mold@b015f7e3f2938ad3a5ed6e5111a8c6c7c1d6db6e
35+
- uses: rui314/setup-mold@f80524ca6eeaa76759b57fb78ddce5d87a20c720
3636

3737
- name: Check formatting and clippy
3838
run: make lint
@@ -43,7 +43,7 @@ jobs:
4343
steps:
4444
- uses: actions/checkout@v4
4545

46-
- uses: rui314/setup-mold@b015f7e3f2938ad3a5ed6e5111a8c6c7c1d6db6e
46+
- uses: rui314/setup-mold@f80524ca6eeaa76759b57fb78ddce5d87a20c720
4747

4848
- name: Run unit tests
4949
run: make test

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22

33
members = [
44
"crates/*",
5+
"crates/bpe/benchmarks",
56
"crates/bpe/bindings/python",
7+
"crates/bpe/tests",
68
]
79
resolver = "2"
810

911
[profile.bench]
1012
debug = true
1113

1214
[profile.release]
13-
debug = true
15+
debug = true

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ A collection of useful algorithms written in Rust. Currently contains:
44

55
- [`geo_filters`](crates/geo_filters): probabilistic data structures that solve the [Distinct Count Problem](https://en.wikipedia.org/wiki/Count-distinct_problem) using geometric filters.
66
- [`bpe`](crates/bpe): fast, correct, and novel algorithms for the [Byte Pair Encoding Algorithm](https://en.wikipedia.org/wiki/Large_language_model#BPE) which are particularly useful for chunking of documents.
7+
- [`bpe-openai`](crates/bpe-openai): Fast tokenizers for OpenAI token sets based on the `bpe` crate.
8+
- [`string-offsets`](crates/string-offsets): converts string positions between bytes, chars, UTF-16 code units, and line numbers. Useful when sending string indices across language boundaries.
79

810
## Background
911

crates/bpe-openai/Cargo.toml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "bpe-openai"
3-
version = "0.1.1"
3+
version = "0.2.1"
44
edition = "2021"
55
description = "Prebuilt fast byte-pair encoders for OpenAI."
66
repository = "https://github.com/github/rust-gems"
@@ -13,16 +13,18 @@ crate-type = ["lib", "staticlib"]
1313
bench = false
1414

1515
[dependencies]
16-
bpe = { version = "0.1.0", path = "../bpe" }
16+
bpe = { version = "0.2.0", path = "../bpe" }
17+
either = "1.13"
18+
regex-automata = "0.4"
1719
rmp-serde = "1"
18-
serde = { version = "1" }
1920

2021
[dev-dependencies]
21-
fancy-regex = "0.13"
22-
tiktoken-rs = { version = "0.5" }
22+
bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] }
23+
tiktoken-rs = "0.6"
2324

2425
[build-dependencies]
25-
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
26+
base64 = "0.22.1"
27+
bpe = { version = "0.2.0", path = "../bpe", features = ["tiktoken"] }
28+
flate2 = "1.0"
2629
rmp-serde = "1"
27-
tiktoken-rs = { version = "0.5" }
28-
serde = { version = "1" }
30+
serde = "1"

crates/bpe-openai/README.md

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,11 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
55
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
66
For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
77

8-
Supported token sets:
8+
Supported tokenizers:
99

10-
- r50k
11-
- p50k
1210
- cl100k
1311
- o200k
1412

15-
> **⚠ CAUTION ⚠**
16-
> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
17-
> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
18-
1913
## Usage
2014

2115
Add a dependency by running

crates/bpe-openai/build.rs

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,35 @@
11
use std::env;
22
use std::fs::File;
3+
use std::io::Read;
34
use std::path::PathBuf;
45

5-
use bpe::byte_pair_encoding::BytePairEncoding;
6+
use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
67
use serde::Serialize;
7-
use tiktoken_rs::CoreBPE;
88

99
fn main() {
10-
serialize_tokens(
11-
"r50k",
12-
&tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
13-
50256,
14-
1,
15-
);
16-
serialize_tokens(
17-
"p50k",
18-
&tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
19-
50280,
20-
1,
21-
);
22-
serialize_tokens(
23-
"cl100k",
24-
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
25-
100256,
26-
17846336922010275747,
27-
);
28-
serialize_tokens(
29-
"cl100k",
30-
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
31-
100256,
10+
serialize_tiktoken_bpe(
11+
"cl100k_base",
12+
include_bytes!("data/cl100k_base.tiktoken.gz"),
3213
17846336922010275747,
3314
);
34-
serialize_tokens(
35-
"o200k",
36-
&tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
37-
199998,
15+
serialize_tiktoken_bpe(
16+
"o200k_base",
17+
include_bytes!("data/o200k_base.tiktoken.gz"),
3818
17846336922010275747,
3919
);
4020
println!("cargo::rerun-if-changed=build.rs");
4121
}
4222

43-
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
23+
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
24+
let mut dec = flate2::read::GzDecoder::new(data);
25+
let mut tiktoken = String::new();
26+
dec.read_to_string(&mut tiktoken).expect("can decode data");
27+
let tokens = read_tiktoken(&tiktoken).expect("can read data");
4428
let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
4529
path.push(format!("bpe_{name}.dict"));
4630
let file = File::create(path).expect("can create output file");
4731
let mut serializer = rmp_serde::Serializer::new(file);
48-
let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
32+
let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));
4933
bpe.serialize(&mut serializer)
5034
.expect("serialization succeeds");
5135
}
758 KB
Binary file not shown.
1.62 MB
Binary file not shown.

0 commit comments

Comments
 (0)