Skip to content

Commit 74c8394

Browse files
authored
Merge pull request #42 from Jules-Bertholet/more-normalization-tests
Add more canonical equivalence tests
2 parents 8092f84 + decf378 commit 74c8394

File tree

5 files changed

+19199
-28
lines changed

5 files changed

+19199
-28
lines changed

.github/workflows/rust.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
build:
1818
runs-on: ubuntu-latest
1919
steps:
20-
- uses: actions/checkout@v3
20+
- uses: actions/checkout@v4
2121
- name: Build
2222
run: cargo build --verbose
2323
- name: Run tests
@@ -28,14 +28,15 @@ jobs:
2828
run: cargo fmt --check
2929
- name: Check clippy
3030
run: cargo clippy --lib --tests
31+
3132
regen:
3233
runs-on: ubuntu-latest
3334
steps:
34-
- uses: actions/checkout@v3
35+
- uses: actions/checkout@v4
3536
- uses: actions/setup-python@v5
3637
with:
3738
python-version: '3.12'
3839
- name: Regen
39-
run: cd scripts && python3 unicode.py
40+
run: rm tests/NormalizationTest.txt && cd scripts && python3 unicode.py
4041
- name: Diff
41-
run: diff src/tables.rs scripts/tables.rs
42+
run: git update-index --refresh && git diff-index --quiet HEAD --

Cargo.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,6 @@ std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
3131
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
3232
compiler_builtins = { version = "0.1", optional = true }
3333

34-
[dev-dependencies]
35-
unicode-normalization = "0.1.23"
36-
3734
[features]
3835
default = []
3936
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']

scripts/unicode.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,13 @@
2727
import os
2828
import re
2929
import sys
30+
import urllib.request
3031
from collections import defaultdict
3132
from itertools import batched
3233

34+
UNICODE_VERSION = "15.1.0"
35+
"""The version of the Unicode data files to download."""
36+
3337
NUM_CODEPOINTS = 0x110000
3438
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
3539

@@ -61,24 +65,28 @@ class OffsetType(enum.IntEnum):
6165
6266
If this is edited, you must ensure that `emit_module` reflects your changes."""
6367

64-
MODULE_FILENAME = "tables.rs"
65-
"""The filename of the emitted Rust module (will be created in the working directory)"""
68+
MODULE_PATH = "../src/tables.rs"
69+
"""The path of the emitted Rust module (relative to the working directory)"""
6670

6771
Codepoint = int
6872
BitPos = int
6973

7074

71-
def fetch_open(filename: str):
75+
def fetch_open(filename: str, local_prefix: str = ""):
7276
"""Opens `filename` and returns its corresponding file object. If its basename isn't on disk under `local_prefix`,
73-
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
77+
fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure.
7478
"""
7579
basename = os.path.basename(filename)
76-
if not os.path.exists(basename):
77-
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
80+
localname = os.path.join(local_prefix, basename)
81+
if not os.path.exists(localname):
82+
urllib.request.urlretrieve(
83+
f"https://www.unicode.org/Public/{UNICODE_VERSION}/ucd/{filename}",
84+
localname,
85+
)
7886
try:
79-
return open(basename, encoding="utf-8")
87+
return open(localname, encoding="utf-8")
8088
except OSError:
81-
sys.stderr.write(f"cannot load {basename}")
89+
sys.stderr.write(f"cannot load {localname}")
8290
sys.exit(1)
8391

8492

@@ -637,7 +645,7 @@ def emit_module(
637645
module.write("}\n")
638646

639647

640-
def main(module_filename: str):
648+
def main(module_path: str):
641649
"""Obtain character data from the latest version of Unicode, transform it into a multi-level
642650
lookup table for character width, and write a Rust module utilizing that table to
643651
`module_path`.
@@ -677,6 +685,9 @@ def main(module_filename: str):
677685
emoji_variations = load_variation_sequences()
678686
variation_table = make_variation_sequence_table(emoji_variations, width_map)
679687

688+
# Download normalization test file for use by tests
689+
fetch_open("NormalizationTest.txt", "../tests/")
690+
680691
print("------------------------")
681692
total_size = 0
682693
for i, table in enumerate(tables):
@@ -692,9 +703,9 @@ def main(module_filename: str):
692703
print("------------------------")
693704
print(f" Total size: {total_size} bytes")
694705

695-
emit_module(module_filename, version, tables, variation_table)
696-
print(f'Wrote to "{module_filename}"')
706+
emit_module(module_path, version, tables, variation_table)
707+
print(f'Wrote to "{module_path}"')
697708

698709

699710
if __name__ == "__main__":
700-
main(MODULE_FILENAME)
711+
main(MODULE_PATH)

0 commit comments

Comments
 (0)