@@ -150,14 +150,15 @@ def load_zero_widths() -> "list[bool]":
150
150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151
151
character. `c` is considered a zero-width character if
152
152
153
- - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
153
+ - it is in general category `Cc`,
154
+ - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
154
155
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
155
156
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
156
157
"""
157
158
158
159
zw_map = []
159
160
160
- # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
161
+ # Characters with general category `Cc` have 0 width
161
162
with fetch_open ("UnicodeData.txt" ) as categories :
162
163
current = 0
163
164
for line in categories .readlines ():
@@ -168,7 +169,7 @@ def load_zero_widths() -> "list[bool]":
168
169
raw_data [1 ],
169
170
raw_data [2 ],
170
171
]
171
- zero_width = cat_code in [ "Cc" , "Mn" , "Me" ]
172
+ zero_width = cat_code == "Cc"
172
173
173
174
assert current <= codepoint
174
175
while current <= codepoint :
@@ -188,10 +189,16 @@ def load_zero_widths() -> "list[bool]":
188
189
# `Default_Ignorable_Code_Point`s also have 0 width:
189
190
# https://www.unicode.org/faq/unsup_char.html#3
190
191
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
192
+ #
193
+ # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
194
+ # as well as a few `Mc` characters that need to be included so that
195
+ # canonically equivalent sequences have the same width.
191
196
with fetch_open ("DerivedCoreProperties.txt" ) as properties :
192
- single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
197
+ single = re .compile (
198
+ r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
199
+ )
193
200
multiple = re .compile (
194
- r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
201
+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
195
202
)
196
203
197
204
for line in properties .readlines ():
@@ -240,6 +247,19 @@ def load_zero_widths() -> "list[bool]":
240
247
# (which are considered 0-width on their own) to form a composed Hangul syllable with
241
248
# width 2. Therefore, we treat it as having width 2.
242
249
zw_map [0x115F ] = False
250
+
251
+ # Unicode spec bug: these should have `Grapheme_Cluster_Break=Extend`,
252
+ # as they canonically decompose to two characters with this property,
253
+ # but they don't.
254
+ zw_map [0x0CC0 ] = True
255
+ zw_map [0x0CC7 ] = True
256
+ zw_map [0x0CC8 ] = True
257
+ zw_map [0x0CCA ] = True
258
+ zw_map [0x0CCB ] = True
259
+ zw_map [0x1B3B ] = True
260
+ zw_map [0x1B3D ] = True
261
+ zw_map [0x1B43 ] = True
262
+
243
263
return zw_map
244
264
245
265
0 commit comments