Add more canonical equivalence tests

Jules-Bertholet · Jules-Bertholet · commit eec13fa271c8 · 2024-04-27T01:18:02.000-04:00
Test that all canonically equivalent sequences in Unicode's `NormalizationTest.txt` have the same width. Currently no changes need to be made to the width logic to ensure these tests pass. However, Unicode 16 is adding a few new characters that will be problematic (the Kirat Rai vowel signs: <https://www.unicode.org/charts/PDF/Unicode-16.0/U160-16D40.pdf>). Adding this test in advance ensures that we won't forget to account for these changes when the time comes.
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -17,7 +17,14 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: Regen tables
+      run: cd scripts && python3 unicode.py
+    - name: Diff tables
+      run: diff src/tables.rs scripts/tables.rs
     - name: Build
       run: cargo build --verbose
     - name: Run tests
@@ -28,14 +35,4 @@ jobs:
       run: cargo fmt --check
     - name: Check clippy
       run: cargo clippy --lib --tests
-  regen:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v5
-      with:
-        python-version: '3.12'
-    - name: Regen
-      run: cd scripts && python3 unicode.py
-    - name: Diff
-      run: diff src/tables.rs scripts/tables.rs
+
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -70,11 +70,11 @@ class OffsetType(enum.IntEnum):
 
 def fetch_open(filename: str):
     """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
-    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
+    fetches it from `https://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
     """
     basename = os.path.basename(filename)
     if not os.path.exists(basename):
-        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
+        os.system(f"curl -O https://www.unicode.org/Public/UNIDATA/{filename}")
     try:
         return open(basename, encoding="utf-8")
     except OSError:
@@ -677,6 +677,9 @@ def main(module_filename: str):
     emoji_variations = load_variation_sequences()
     variation_table = make_variation_sequence_table(emoji_variations, width_map)
 
+    # Download normalization test file for use by tests
+    fetch_open("NormalizationTest.txt")
+
     print("------------------------")
     total_size = 0
     for i, table in enumerate(tables):
diff --git a/tests/tests.rs b/tests/tests.rs
@@ -8,6 +8,11 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use std::{
+    fs::File,
+    io::{BufRead, BufReader},
+};
+
 use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
 
 #[test]
@@ -149,6 +154,38 @@ fn test_canonical_equivalence() {
     }
 }
 
+/// Asserts equal width for the first two fields of each data line in `NormalizationTest.txt`.
+/// Panics unless that file is in `scripts/`; run the `unicode.py` script to download it.
+#[test]
+fn test_canonical_equivalence_2() {
+    let norm_file = BufReader::new(
+        File::open("scripts/NormalizationTest.txt")
+            .expect("run `unicode.py` first to download `NormalizationTest.txt`"),
+    );
+    for line in norm_file.lines() {
+        let line = line.unwrap(); // panics if reading the line fails
+        if line.is_empty() || line.starts_with('#') || line.starts_with('@') {
+            continue; // not a data line: blank, or begins with `#` or `@`
+        }
+        let (nfc, postnfc) = line.split_once(';').unwrap(); // first `;`-separated field
+        let (nfd, _) = postnfc.split_once(';').unwrap(); // second field; the rest is ignored
+        let nfc: String = nfc
+            .split(' ') // space-separated hexadecimal code points
+            .map(|s| char::try_from(u32::from_str_radix(s, 16).unwrap()).unwrap())
+            .collect();
+        let nfd: String = nfd
+            .split(' ') // same decoding as above: hex code points -> `char`s -> `String`
+            .map(|s| char::try_from(u32::from_str_radix(s, 16).unwrap()).unwrap())
+            .collect();
+        // NOTE(review): if fields are `source;NFC;NFD;...`, this compares source vs NFC - confirm
+        assert_eq!(
+            nfc.width(),
+            nfd.width(),
+            "width of {nfc:?} differs from {nfd:?}"
+        );
+    }
+}
+
 #[test]
 fn test_emoji_presentation() {
     assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1));