@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
def fetch_open(filename: str):
    """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
-    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
+    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
+    """
    if not os.path.exists(os.path.basename(filename)):
        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
    try:
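The hunk cuts off at `try:`; judging from the docstring's "Exits with code 1 on failure", the rest of the function presumably looks something like the sketch below (the exact exception type and error message are assumptions, not shown in the diff):

    try:
        # The file is now in the working directory, whether it was already
        # on disk or just fetched by curl above.
        return open(os.path.basename(filename), encoding="utf-8")
    except OSError:
        sys.stderr.write(f"cannot load {filename}\n")
        sys.exit(1)  # "Exits with code 1 on failure"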
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":
class EffectiveWidth(enum.IntEnum):
    """Represents the width of a Unicode character. All East Asian Width classes resolve into
-    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
+    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
+    """

    ZERO = 0
    """ Zero columns wide. """
@@ -148,10 +150,10 @@ def load_zero_widths() -> "list[bool]":
    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
    character. `c` is considered a zero-width character if `c` is in general categories
    `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
-    or if it has the `Default_Ignorable_Code_Point` property (determined by fetching
-    and processing `DerivedCoreProperties.txt`)."""
+    if it has the `Default_Ignorable_Code_Point` property (determined by fetching
+    and processing `DerivedCoreProperties.txt`), or if it has a `Hangul_Syllable_Type`
+    of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`)."""

-    zw_cat_codes = ["Cc", "Cf", "Mn", "Me"]
    zw_map = []

    with fetch_open("UnicodeData.txt") as categories:
@@ -164,7 +166,7 @@ def load_zero_widths() -> "list[bool]":
                raw_data[1],
                raw_data[2],
            ]
-            zero_width = cat_code in zw_cat_codes
+            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]

            assert current <= codepoint
            while current <= codepoint:
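To make the field indexing above concrete, here is how a single `UnicodeData.txt` record feeds this loop; the sample line is illustrative, and the surrounding loop machinery is omitted:

# UnicodeData.txt records are semicolon-separated; field 0 is the codepoint
# in hex, field 1 the character name, field 2 the General_Category.
line = "0300;COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;"
raw_data = line.split(";")
codepoint = int(raw_data[0], 16)  # 0x0300
cat_code = raw_data[2]  # "Mn"
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]  # True: nonspacing marks are zero-width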
@@ -182,30 +184,56 @@ def load_zero_widths() -> "list[bool]":
        zw_map.append(False)

    with fetch_open("DerivedCoreProperties.txt") as properties:
-        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )

        for line in properties.readlines():
-            raw_data = None  # (low, high, category)
+            raw_data = None  # (low, high)
            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1), match.group(2))
+                raw_data = (match.group(1), match.group(1))
            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2), match.group(3))
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                zw_map[cp] = True
+
+    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
+    # as zero-width. This matches the behavior of glibc `wcwidth`.
+    #
+    # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
+    # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
+    # into a single wide grapheme. So we treat vowel and trailing jamo as
+    # 0-width, such that only the width of the leading jamo is counted
+    # and the resulting grapheme has width 2.
+    with fetch_open("HangulSyllableType.txt") as categories:
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
+
+        for line in categories.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
-            cat = raw_data[2]
-            if cat not in zw_cat_codes:
-                for cp in range(low, high + 1):
-                    zw_map[cp] = True
+            for cp in range(low, high + 1):
+                zw_map[cp] = True

    return zw_map


class Bucket:
    """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
-    list overlaps with another's width list, those buckets can be merged via `try_extend`."""
+    list overlaps with another's width list, those buckets can be merged via `try_extend`.
+    """

    def __init__(self):
        """Creates an empty bucket."""
@@ -254,9 +282,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
    same bucket. Returns a list of the buckets in increasing order of those bits."""
    num_bits = cap_bit - low_bit
    assert num_bits > 0
-    buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
+    buckets = [Bucket() for _ in range(0, 2**num_bits)]
    mask = (1 << num_bits) - 1
-    for (codepoint, width) in entries:
+    for codepoint, width in entries:
        buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
    return buckets
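A tiny worked example of the bit masking above, with assumed values low_bit=0 and cap_bit=6 (not the real `TABLE_CFGS`):

low_bit, cap_bit = 0, 6
num_bits = cap_bit - low_bit  # 6
mask = (1 << num_bits) - 1  # 0x3F; 2**num_bits == 64 buckets
# Codepoints whose bits [low_bit, cap_bit) agree land in the same bucket:
assert (0x41 >> low_bit) & mask == (0x81 >> low_bit) & mask == 0x01
assert (0x42 >> low_bit) & mask == 0x02  # different low bits, different bucket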
@@ -293,7 +321,7 @@ def __init__(
        buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

        for bucket in buckets:
-            for (i, existing) in enumerate(self.indexed):
+            for i, existing in enumerate(self.indexed):
                if existing.try_extend(bucket):
                    self.entries.append(i)
                    break
@@ -307,7 +335,8 @@ def __init__(
    def indices_to_widths(self):
        """Destructively converts the indices in this table to the `EffectiveWidth` values of
-        their buckets. Assumes that no bucket contains codepoints with different widths."""
+        their buckets. Assumes that no bucket contains codepoints with different widths.
+        """
        self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
        del self.indexed
@@ -339,7 +368,7 @@ def make_tables(
    to include in the top-level table."""
    tables = []
    entry_groups = [entries]
-    for (low_bit, cap_bit, offset_type) in table_cfgs:
+    for low_bit, cap_bit, offset_type in table_cfgs:
        table = Table(entry_groups, low_bit, cap_bit, offset_type)
        entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
        tables.append(table)
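The loop above chains the levels: each `Table` buckets its input, and each bucket's entry list becomes the input of the next, finer-grained level; the last level's buckets are uniform enough to be rewritten into `EffectiveWidth` values by `indices_to_widths`. A conceptual call, with made-up bit ranges and `OffsetType` members standing in for the real `TABLE_CFGS`:

# width_map: list of EffectiveWidth values indexed by codepoint, built in main().
example_cfgs = [
    (13, 21, OffsetType.U8),  # assumed values, for illustration only
    (6, 13, OffsetType.U8),
    (0, 6, OffsetType.U2),
]
example_tables = make_tables(example_cfgs, enumerate(width_map))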
@@ -350,7 +379,8 @@ def emit_module(
    out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
):
    """Outputs a Rust module to `out_name` using table data from `tables`.
-    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
+    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
+    """
    if os.path.exists(out_name):
        os.remove(out_name)
    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
@@ -456,7 +486,7 @@ def emit_module(
        )

        subtable_count = 1
-        for (i, table) in enumerate(tables):
+        for i, table in enumerate(tables):
            new_subtable_count = len(table.buckets())
            if i == len(tables) - 1:
                table.indices_to_widths()  # for the last table, indices == widths
@@ -466,7 +496,7 @@ def emit_module(
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
            )
-            for (j, byte) in enumerate(byte_array):
+            for j, byte in enumerate(byte_array):
                # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                if j % 15 == 0:
                    module.write("\n")
@@ -506,15 +536,11 @@ def main(module_filename: str):
    # Override for soft hyphen
    width_map[0x00AD] = EffectiveWidth.NARROW

-    # Override for Hangul Jamo medial vowels & final consonants
-    for i in range(0x1160, 0x11FF + 1):
-        width_map[i] = EffectiveWidth.ZERO
-
    tables = make_tables(TABLE_CFGS, enumerate(width_map))

    print("------------------------")
    total_size = 0
-    for (i, table) in enumerate(tables):
+    for i, table in enumerate(tables):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} Size: {size_bytes} bytes")
        total_size += size_bytes
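With the explicit 0x1160..0x11FF override removed, the same zero widths now come out of `load_zero_widths()` via `HangulSyllableType.txt`. An illustrative sanity check (not part of the diff) that could sit next to the size report in `main`:

    assert width_map[0x1161] == EffectiveWidth.ZERO  # Vowel_Jamo, via HangulSyllableType.txt
    assert width_map[0x11A8] == EffectiveWidth.ZERO  # Trailing_Jamo, via HangulSyllableType.txt
    assert width_map[0x00AD] == EffectiveWidth.NARROW  # soft hyphen override above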