Skip to content

Commit a6b5a52

Browse files
Don't treat Prepended_Concatenation_Marks as zero width
1 parent 397ab07 commit a6b5a52

File tree

3 files changed

+168
-127
lines changed

3 files changed

+168
-127
lines changed

scripts/unicode.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
148148

149149
def load_zero_widths() -> "list[bool]":
150150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151-
character. `c` is considered a zero-width character if `c` is in general categories
152-
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
153-
if it has the `Default_Ignorable_Code_Point` property (determined by fetching
154-
and processing `DerivedCoreProperties.txt`), or if it has a `Hangul_Syllable_Type`
155-
of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`)."""
151+
character. `c` is considered a zero-width character if
152+
153+
- it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154+
and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
155+
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
156+
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
157+
"""
156158

157159
zw_map = []
158160

161+
# Characters with general category `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
159162
with fetch_open("UnicodeData.txt") as categories:
160163
current = 0
161164
for line in categories.readlines():
@@ -183,6 +186,31 @@ def load_zero_widths() -> "list[bool]":
183186
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
184187
zw_map.append(False)
185188

189+
# ...unless they are a `Prepended_Concatenation_Mark`.
190+
# https://www.unicode.org/reports/tr44/:
191+
# "A small class of visible format controls,
192+
# which precede and then span a sequence of other characters, usually digits.
193+
# These have also been known as "subtending marks",
194+
# because most of them take a form which visually extends underneath the sequence of following digits."
195+
with fetch_open("PropList.txt") as properties:
196+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+")
197+
multiple = re.compile(
198+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"
199+
)
200+
for line in properties.readlines():
201+
raw_data = None # (low, high)
202+
if match := single.match(line):
203+
raw_data = (match.group(1), match.group(1))
204+
elif match := multiple.match(line):
205+
raw_data = (match.group(1), match.group(2))
206+
else:
207+
continue
208+
low = int(raw_data[0], 16)
209+
high = int(raw_data[1], 16)
210+
for cp in range(low, high + 1):
211+
zw_map[cp] = False
212+
213+
# `Default_Ignorable_Code_Point`s also have 0 width
186214
with fetch_open("DerivedCoreProperties.txt") as properties:
187215
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
188216
multiple = re.compile(
@@ -512,16 +540,18 @@ def main(module_filename: str):
512540
`module_filename`.
513541
514542
We obey the following rules in decreasing order of importance:
515-
- The soft hyphen (`U+00AD`) is single-width.
516-
- Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
517-
- All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
543+
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
544+
- Hangul Jamo medial vowels & final consonants are zero-width.
545+
- All `Default_Ignorable_Code_Point`s are zero-width.
546+
- All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
547+
except for `Prepended_Concatenation_Mark`s.
518548
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
519549
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
520550
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
521-
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
551+
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
522552
523-
These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
524-
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
553+
These rules are based on UAX #11, other Unicode standards, and various `wcwidth()` implementations.
554+
"""
525555
version = load_unicode_version()
526556
print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
527557

0 commit comments

Comments
 (0)