@@ -147,9 +147,14 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
147
147
def load_zero_widths () -> "list[bool]" :
148
148
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
149
149
character. `c` is considered a zero-width character if `c` is in general categories
150
- `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
150
+ `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
151
+ or if it has the `Default_Ignorable_Code_Point` property (determined by fetching
152
+ and processing `DerivedCoreProperties.txt`)."""
153
+
154
+ zw_cat_codes = ["Cc" , "Cf" , "Mn" , "Me" ]
155
+ zw_map = []
156
+
151
157
with fetch_open ("UnicodeData.txt" ) as categories :
152
- zw_map = []
153
158
current = 0
154
159
for line in categories .readlines ():
155
160
if len (raw_data := line .split (";" )) != 15 :
@@ -159,7 +164,7 @@ def load_zero_widths() -> "list[bool]":
159
164
raw_data [1 ],
160
165
raw_data [2 ],
161
166
]
162
- zero_width = cat_code in [ "Cc" , "Cf" , "Mn" , "Me" ]
167
+ zero_width = cat_code in zw_cat_codes
163
168
164
169
assert current <= codepoint
165
170
while current <= codepoint :
@@ -176,7 +181,26 @@ def load_zero_widths() -> "list[bool]":
176
181
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
177
182
zw_map .append (False )
178
183
179
- return zw_map
184
+ with fetch_open ("DerivedCoreProperties.txt" ) as properties :
185
+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)" )
186
+ multiple = re .compile (r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)" )
187
+
188
+ for line in properties .readlines ():
189
+ raw_data = None # (low, high, category)
190
+ if match := single .match (line ):
191
+ raw_data = (match .group (1 ), match .group (1 ), match .group (2 ))
192
+ elif match := multiple .match (line ):
193
+ raw_data = (match .group (1 ), match .group (2 ), match .group (3 ))
194
+ else :
195
+ continue
196
+ low = int (raw_data [0 ], 16 )
197
+ high = int (raw_data [1 ], 16 )
198
+ cat = raw_data [2 ]
199
+ if cat not in zw_cat_codes :
200
+ for cp in range (low , high + 1 ):
201
+ zw_map [cp ] = True
202
+
203
+ return zw_map
180
204
181
205
182
206
class Bucket :
0 commit comments