Mark interlinear annotation chars and Egyptian hieroglyph format controls as non-zero width

Jules-Bertholet · Jules-Bertholet · commit aae585fbd1a2 · 2024-02-12T12:05:06.000-05:00
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -150,15 +150,14 @@ def load_zero_widths() -> "list[bool]":
     """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
     character. `c` is considered a zero-width character if
 
-    - it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
-      and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
+    - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
     - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
     - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
     """
 
     zw_map = []
 
-    # Characters with general category  `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
+    # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
     with fetch_open("UnicodeData.txt") as categories:
         current = 0
         for line in categories.readlines():
@@ -169,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
                 raw_data[1],
                 raw_data[2],
             ]
-            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
+            zero_width = cat_code in ["Cc", "Mn", "Me"]
 
             assert current <= codepoint
             while current <= codepoint:
@@ -186,32 +185,9 @@ def load_zero_widths() -> "list[bool]":
             # Catch any leftover codepoints. They must be unassigned (so nonzero width)
             zw_map.append(False)
 
-    # ...unless they are a `Prepended_Concatenation_Mark`.
-    # https://www.unicode.org/reports/tr44/:
-    # "A small class of visible format controls,
-    # which precede and then span a sequence of other characters, usually digits.
-    # These have also been known as "subtending marks",
-    # because most of them take a form which visually extends underneath the sequence of following digits."
-    with fetch_open("PropList.txt") as properties:
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+")
-        multiple = re.compile(
-            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"
-        )
-        for line in properties.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                zw_map[cp] = False
-
     # `Default_Ignorable_Code_Point`s also have 0 width:
     # https://www.unicode.org/faq/unsup_char.html#3
+    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
     with fetch_open("DerivedCoreProperties.txt") as properties:
         single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
         multiple = re.compile(
@@ -552,8 +528,7 @@ def main(module_filename: str):
     - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
     - Hangul jamo medial vowels & final consonants are zero-width.
     - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
-    - All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
-      except for `Prepended_Concatenation_Mark`s.
+    - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
     - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
     - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
     - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
diff --git a/src/tables.rs b/src/tables.rs
@@ -182,7 +182,7 @@ pub mod charwidth {
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x47, 0xBD, 0x06, 0x06, 0x06, 0x06,
+        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0xBD, 0x06, 0x06, 0x06, 0x06,
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -430,7 +430,7 @@ pub mod charwidth {
         0x6A, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
         0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
-        0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x00, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x54, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, 0x55, 0x01, 0x41, 0x55,
         0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x40, 0x15, 0x55, 0x55,
diff --git a/src/tests.rs b/src/tests.rs
@@ -223,7 +223,7 @@ fn test_jamo() {
 }
 
 #[test]
-fn test_prepended_concatenation_mark() {
+fn test_prepended_concatenation_marks() {
     use super::UnicodeWidthChar;
     #[cfg(feature = "no_std")]
     use core::option::Option::{None, Some};
@@ -233,3 +233,25 @@ fn test_prepended_concatenation_mark() {
     assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1));
     assert_eq!(UnicodeWidthChar::width('\u{110BD}'), Some(1));
 }
+
+#[test]
+fn test_interlinear_annotation_chars() {
+    use super::UnicodeWidthChar;
+    #[cfg(feature = "no_std")]
+    use core::option::Option::{None, Some};
+
+    assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1));
+    assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1));
+    assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1));
+}
+
+#[test]
+fn test_hieroglyph_format_controls() {
+    use super::UnicodeWidthChar;
+    #[cfg(feature = "no_std")]
+    use core::option::Option::{None, Some};
+
+    assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1));
+    assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1));
+    assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1));
+}