Skip to content

Commit aae585f

Browse files
Mark interlinear annotation chars and Egyptian hieroglyph format controls as non-zero width
1 parent 436b0db commit aae585f

File tree

3 files changed

+30
-33
lines changed

3 files changed

+30
-33
lines changed

scripts/unicode.py

Lines changed: 5 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -150,15 +150,14 @@ def load_zero_widths() -> "list[bool]":
150150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151151
character. `c` is considered a zero-width character if
152152
153-
- it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154-
and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
153+
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
155154
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
156155
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
157156
"""
158157

159158
zw_map = []
160159

161-
# Characters with general category `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
160+
# Characters with general category `Cc`, `Mn`, or `Me` have 0 width.
162161
with fetch_open("UnicodeData.txt") as categories:
163162
current = 0
164163
for line in categories.readlines():
@@ -169,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
169168
raw_data[1],
170169
raw_data[2],
171170
]
172-
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
171+
zero_width = cat_code in ["Cc", "Mn", "Me"]
173172

174173
assert current <= codepoint
175174
while current <= codepoint:
@@ -186,32 +185,9 @@ def load_zero_widths() -> "list[bool]":
186185
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
187186
zw_map.append(False)
188187

189-
# ...unless they are a `Prepended_Concatenation_Mark`.
190-
# https://www.unicode.org/reports/tr44/:
191-
# "A small class of visible format controls,
192-
# which precede and then span a sequence of other characters, usually digits.
193-
# These have also been known as "subtending marks",
194-
# because most of them take a form which visually extends underneath the sequence of following digits."
195-
with fetch_open("PropList.txt") as properties:
196-
single = re.compile(r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+")
197-
multiple = re.compile(
198-
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"
199-
)
200-
for line in properties.readlines():
201-
raw_data = None # (low, high)
202-
if match := single.match(line):
203-
raw_data = (match.group(1), match.group(1))
204-
elif match := multiple.match(line):
205-
raw_data = (match.group(1), match.group(2))
206-
else:
207-
continue
208-
low = int(raw_data[0], 16)
209-
high = int(raw_data[1], 16)
210-
for cp in range(low, high + 1):
211-
zw_map[cp] = False
212-
213188
# `Default_Ignorable_Code_Point`s also have 0 width:
214189
# https://www.unicode.org/faq/unsup_char.html#3
190+
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
215191
with fetch_open("DerivedCoreProperties.txt") as properties:
216192
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
217193
multiple = re.compile(
@@ -552,8 +528,7 @@ def main(module_filename: str):
552528
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
553529
- Hangul jamo medial vowels & final consonants are zero-width.
554530
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
555-
- All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
556-
except for `Prepended_Concatenation_Mark`s.
531+
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
557532
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
558533
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
559534
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width

src/tables.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ pub mod charwidth {
182182
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
183183
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
184184
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
185-
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x47, 0xBD, 0x06, 0x06, 0x06, 0x06,
185+
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0xBD, 0x06, 0x06, 0x06, 0x06,
186186
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
187187
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
188188
0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
@@ -430,7 +430,7 @@ pub mod charwidth {
430430
0x6A, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
431431
0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
432432
0x55, 0x54, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
433-
0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x00, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55,
433+
0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x00, 0x00, 0x54, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55,
434434
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55,
435435
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, 0x55, 0x01, 0x41, 0x55,
436436
0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x40, 0x15, 0x55, 0x55,

src/tests.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ fn test_jamo() {
223223
}
224224

225225
#[test]
226-
fn test_prepended_concatenation_mark() {
226+
fn test_prepended_concatenation_marks() {
227227
use super::UnicodeWidthChar;
228228
#[cfg(feature = "no_std")]
229229
use core::option::Option::{None, Some};
@@ -233,3 +233,25 @@ fn test_prepended_concatenation_mark() {
233233
assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1));
234234
assert_eq!(UnicodeWidthChar::width('\u{110BD}'), Some(1));
235235
}
236+
237+
#[test]
238+
fn test_interlinear_annotation_chars() {
239+
use super::UnicodeWidthChar;
240+
#[cfg(feature = "no_std")]
241+
use core::option::Option::{None, Some};
242+
243+
assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1));
244+
assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1));
245+
assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1));
246+
}
247+
248+
#[test]
249+
fn test_hieroglyph_format_controls() {
250+
use super::UnicodeWidthChar;
251+
#[cfg(feature = "no_std")]
252+
use core::option::Option::{None, Some};
253+
254+
assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1));
255+
assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1));
256+
assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1));
257+
}

0 commit comments

Comments
 (0)