@@ -148,14 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
148
148
149
149
def load_zero_widths () -> "list[bool]" :
150
150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151
- character. `c` is considered a zero-width character if `c` is in general categories
152
- `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
153
- if it has the `Default_Ignorable_Code_Point` property (determined by fetching
154
- and processing `DerivedCoreProperties.txt`), or if it has a `Hangul_Syllable_Type`
155
- of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`)."""
151
+ character. `c` is considered a zero-width character if
152
+
153
+ - it is in general categories `Cc`, `Cf`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
154
+ and is not a `Prepended_Concatenation_Mark` (determined from `PropList.txt`),
155
+ - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
156
+ - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
157
+ """
156
158
157
159
zw_map = []
158
160
161
+ # Characters with general category `Cc`, `Cf`, `Mn`, or `Me` have 0 width...
159
162
with fetch_open ("UnicodeData.txt" ) as categories :
160
163
current = 0
161
164
for line in categories .readlines ():
@@ -183,6 +186,31 @@ def load_zero_widths() -> "list[bool]":
183
186
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
184
187
zw_map .append (False )
185
188
189
+ # ...unless they are a `Prepended_Concatenation_Mark`.
190
+ # https://www.unicode.org/reports/tr44/:
191
+ # "A small class of visible format controls,
192
+ # which precede and then span a sequence of other characters, usually digits.
193
+ # These have also been known as "subtending marks",
194
+ # because most of them take a form which visually extends underneath the sequence of following digits."
195
+ with fetch_open ("PropList.txt" ) as properties :
196
+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+" )
197
+ multiple = re .compile (
198
+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Prepended_Concatenation_Mark\s+"
199
+ )
200
+ for line in properties .readlines ():
201
+ raw_data = None # (low, high)
202
+ if match := single .match (line ):
203
+ raw_data = (match .group (1 ), match .group (1 ))
204
+ elif match := multiple .match (line ):
205
+ raw_data = (match .group (1 ), match .group (2 ))
206
+ else :
207
+ continue
208
+ low = int (raw_data [0 ], 16 )
209
+ high = int (raw_data [1 ], 16 )
210
+ for cp in range (low , high + 1 ):
211
+ zw_map [cp ] = False
212
+
213
+ # `Default_Ignorable_Code_Point`s also have 0 width
186
214
with fetch_open ("DerivedCoreProperties.txt" ) as properties :
187
215
single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
188
216
multiple = re .compile (
@@ -512,16 +540,18 @@ def main(module_filename: str):
512
540
`module_filename`.
513
541
514
542
We obey the following rules in decreasing order of importance:
515
- - The soft hyphen (`U+00AD`) is single-width.
516
- - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
517
- - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
543
+ - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
544
+ - Hangul Jamo medial vowels & final consonants are zero-width.
545
+ - All `Default_Ignorable_Code_Point`s are zero-width.
546
+ - All codepoints in general categories `Cc`, `Cf`, `Mn`, or `Me` are zero-width,
547
+ except for `Prepended_Concatenation_Mark`s.
518
548
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
519
549
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
520
550
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
521
- of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
551
+ of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
522
552
523
- These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
524
- http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c """
553
+ These rules are based on UAX #11, other Unicode standards, and various `wcwidth()` implementations.
554
+ """
525
555
version = load_unicode_version ()
526
556
print (f"Generating module for Unicode { version [0 ]} .{ version [1 ]} .{ version [2 ]} " )
527
557
0 commit comments