Skip to content

Commit eec13fa

Browse files
Add more canonical equivalence tests
Test that all canonically equivalent sequences in Unicode's `NormalizationTest.txt` have the same width. Currently no changes need to be made to the width logic to ensure these tests pass. However, Unicode 16 is adding a few new characters that will be problematic (the Kirat Rai vowel signs: <https://www.unicode.org/charts/PDF/Unicode-16.0/U160-16D40.pdf>). Adding this test in advance ensures that we won't forget to account for these changes when the time comes.
1 parent 8092f84 commit eec13fa

File tree

3 files changed

+51
-14
lines changed

3 files changed

+51
-14
lines changed

.github/workflows/rust.yml

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,14 @@ jobs:
1717
build:
1818
runs-on: ubuntu-latest
1919
steps:
20-
- uses: actions/checkout@v3
20+
- uses: actions/checkout@v4
21+
- uses: actions/setup-python@v5
22+
with:
23+
python-version: '3.12'
24+
- name: Regen tables
25+
run: cd scripts && python3 unicode.py
26+
- name: Diff tables
27+
run: diff src/tables.rs scripts/tables.rs
2128
- name: Build
2229
run: cargo build --verbose
2330
- name: Run tests
@@ -28,14 +35,4 @@ jobs:
2835
run: cargo fmt --check
2936
- name: Check clippy
3037
run: cargo clippy --lib --tests
31-
regen:
32-
runs-on: ubuntu-latest
33-
steps:
34-
- uses: actions/checkout@v3
35-
- uses: actions/setup-python@v5
36-
with:
37-
python-version: '3.12'
38-
- name: Regen
39-
run: cd scripts && python3 unicode.py
40-
- name: Diff
41-
run: diff src/tables.rs scripts/tables.rs
38+

scripts/unicode.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,11 @@ class OffsetType(enum.IntEnum):
7070

7171
def fetch_open(filename: str):
7272
"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
73-
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
73+
fetches it from `https://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
7474
"""
7575
basename = os.path.basename(filename)
7676
if not os.path.exists(basename):
77-
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
77+
os.system(f"curl -O https://www.unicode.org/Public/UNIDATA/{filename}")
7878
try:
7979
return open(basename, encoding="utf-8")
8080
except OSError:
@@ -677,6 +677,9 @@ def main(module_filename: str):
677677
emoji_variations = load_variation_sequences()
678678
variation_table = make_variation_sequence_table(emoji_variations, width_map)
679679

680+
# Download normalization test file for use by tests
681+
fetch_open("NormalizationTest.txt")
682+
680683
print("------------------------")
681684
total_size = 0
682685
for i, table in enumerate(tables):

tests/tests.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
use std::{
12+
fs::File,
13+
io::{BufRead, BufReader},
14+
};
15+
1116
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
1217

1318
#[test]
@@ -149,6 +154,38 @@ fn test_canonical_equivalence() {
149154
}
150155
}
151156

157+
/// Requires `NormalizationTest.txt` to be present in the `scripts/` directory.
158+
/// Run the `unicode.py` script to download it.
159+
#[test]
160+
fn test_canonical_equivalence_2() {
161+
let norm_file = BufReader::new(
162+
File::open("scripts/NormalizationTest.txt")
163+
.expect("run `unicode.py` first to download `NormalizationTest.txt`"),
164+
);
165+
for line in norm_file.lines() {
166+
let line = line.unwrap();
167+
if line.is_empty() || line.starts_with('#') || line.starts_with('@') {
168+
continue;
169+
}
170+
let (nfc, postnfc) = line.split_once(';').unwrap();
171+
let (nfd, _) = postnfc.split_once(';').unwrap();
172+
let nfc: String = nfc
173+
.split(' ')
174+
.map(|s| char::try_from(u32::from_str_radix(s, 16).unwrap()).unwrap())
175+
.collect();
176+
let nfd: String = nfd
177+
.split(' ')
178+
.map(|s| char::try_from(u32::from_str_radix(s, 16).unwrap()).unwrap())
179+
.collect();
180+
181+
assert_eq!(
182+
nfc.width(),
183+
nfd.width(),
184+
"width of {nfc:?} differs from {nfd:?}"
185+
);
186+
}
187+
}
188+
152189
#[test]
153190
fn test_emoji_presentation() {
154191
assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1));

0 commit comments

Comments
 (0)