@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
def fetch_open(filename: str):
    """Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
-    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
+    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
+    """
    if not os.path.exists(os.path.basename(filename)):
        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
    try:
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":

class EffectiveWidth(enum.IntEnum):
    """Represents the width of a Unicode character. All East Asian Width classes resolve into
-    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
+    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
+    """

    ZERO = 0
    """ Zero columns wide. """
@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":

def load_zero_widths() -> "list[bool]":
    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
-    character. `c` is considered a zero-width character if `c` is in general categories
-    `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
+    character. `c` is considered a zero-width character if
+
+    - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
+    - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
+    - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
+    """
+
+    zw_map = []
+
+    # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
    with fetch_open("UnicodeData.txt") as categories:
-        zw_map = []
        current = 0
        for line in categories.readlines():
            if len(raw_data := line.split(";")) != 15:
@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
                raw_data[1],
                raw_data[2],
            ]
-            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
+            zero_width = cat_code in ["Cc", "Mn", "Me"]
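+            # e.g. the UnicodeData.txt line
+            #     0300;COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;;;;;
+            # has 15 semicolon-separated fields; its category `Mn` makes it zero-width.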

            assert current <= codepoint
            while current <= codepoint:
@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
            # Catch any leftover codepoints. They must be unassigned (so nonzero width)
            zw_map.append(False)

-    return zw_map
+    # `Default_Ignorable_Code_Point`s also have 0 width:
+    # https://www.unicode.org/faq/unsup_char.html#3
+    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
+    with fetch_open("DerivedCoreProperties.txt") as properties:
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )
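+        # The two regexes match data lines like
+        #     00AD          ; Default_Ignorable_Code_Point # Cf       SOFT HYPHEN
+        # and range lines like
+        #     180B..180D    ; Default_Ignorable_Code_Point # Mn   [3] MONGOLIAN FREE VARIATION SELECTOR ONE..THREE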
+
+        for line in properties.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                zw_map[cp] = True
+
+    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
+    # as zero-width. This matches the behavior of glibc `wcwidth`.
+    #
+    # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
+    # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
+    # into a single wide grapheme. So we treat vowel and trailing jamo as
+    # 0-width, such that only the width of the leading jamo is counted
+    # and the resulting grapheme has width 2.
+    #
+    # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
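+    # For example, U+1100 HANGUL CHOSEONG KIYEOK (a leading jamo, width 2) followed
+    # by U+1161 HANGUL JUNGSEONG A (a vowel jamo, width 0) renders as the single
+    # syllable block 가, so the combined grapheme still has width 2.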
+    with fetch_open("HangulSyllableType.txt") as categories:
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
+
+        for line in categories.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                zw_map[cp] = True
+
+    # Special case: U+115F HANGUL CHOSEONG FILLER.
+    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
+    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
+    # (which are considered 0-width on their own) to form a composed Hangul syllable with
+    # width 2. Therefore, we treat it as having width 2.
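+    # (e.g. the sequence <U+115F, U+1161> displays as one width-2 syllable block;
+    # counting the filler as 2 and the vowel jamo as 0 yields the correct total)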
+    zw_map[0x115F] = False
+    return zw_map


class Bucket:
    """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
-    list overlaps with another's width list, those buckets can be merged via `try_extend`."""
+    list overlaps with another's width list, those buckets can be merged via `try_extend`.
+    """

    def __init__(self):
        """Creates an empty bucket."""
@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
    same bucket. Returns a list of the buckets in increasing order of those bits."""
    num_bits = cap_bit - low_bit
    assert num_bits > 0
-    buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
+    buckets = [Bucket() for _ in range(0, 2**num_bits)]
    mask = (1 << num_bits) - 1
-    for (codepoint, width) in entries:
+    for codepoint, width in entries:
        buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
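    # e.g. with low_bit=2 and cap_bit=5: num_bits == 3, mask == 0b111, and
    # codepoint 0x41 (0b100_0001) lands in bucket (0x41 >> 2) & 0b111 == 0.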
    return buckets
@@ -269,7 +334,7 @@ def __init__(
        buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

        for bucket in buckets:
-            for (i, existing) in enumerate(self.indexed):
+            for i, existing in enumerate(self.indexed):
                if existing.try_extend(bucket):
                    self.entries.append(i)
                    break
@@ -283,7 +348,8 @@ def __init__(

    def indices_to_widths(self):
        """Destructively converts the indices in this table to the `EffectiveWidth` values of
-        their buckets. Assumes that no bucket contains codepoints with different widths."""
+        their buckets. Assumes that no bucket contains codepoints with different widths.
+        """
        self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
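        # (e.g. an entry that pointed at a bucket of wide codepoints now stores
        # int(EffectiveWidth.WIDE) itself rather than that bucket's index)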
        del self.indexed
@@ -315,7 +381,7 @@ def make_tables(
    to include in the top-level table."""
    tables = []
    entry_groups = [entries]
-    for (low_bit, cap_bit, offset_type) in table_cfgs:
+    for low_bit, cap_bit, offset_type in table_cfgs:
        table = Table(entry_groups, low_bit, cap_bit, offset_type)
        entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
        tables.append(table)
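    # e.g. a TABLE_CFGS with three (low_bit, cap_bit) ranges covering bits 0..21
    # between them splits each 21-bit codepoint into three keys; each table's
    # buckets become the entry groups of the next, finer-grained table.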
@@ -326,7 +392,8 @@ def emit_module(
    out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
):
    """Outputs a Rust module to `out_name` using table data from `tables`.
-    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
+    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
+    """
    if os.path.exists(out_name):
        os.remove(out_name)
    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
@@ -432,7 +499,7 @@ def emit_module(
        )

        subtable_count = 1
-        for (i, table) in enumerate(tables):
+        for i, table in enumerate(tables):
            new_subtable_count = len(table.buckets())
            if i == len(tables) - 1:
                table.indices_to_widths()  # for the last table, indices == widths
@@ -442,7 +509,7 @@ def emit_module(
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
            )
-            for (j, byte) in enumerate(byte_array):
+            for j, byte in enumerate(byte_array):
                # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                if j % 15 == 0:
                    module.write("\n")
@@ -458,16 +525,17 @@ def main(module_filename: str):
    `module_filename`.

    We obey the following rules in decreasing order of importance:
-    - The soft hyphen (`U+00AD`) is single-width.
-    - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
-    - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
+    - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
+    - Hangul jamo medial vowels & final consonants are zero-width.
+    - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
+    - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
    - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
    - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
    - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
-    of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
+      of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

-    These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
-    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
+    These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
+    """
    version = load_unicode_version()
    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
@@ -482,15 +550,11 @@ def main(module_filename: str):
    # Override for soft hyphen
    width_map[0x00AD] = EffectiveWidth.NARROW
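    # (U+00AD SOFT HYPHEN is a `Default_Ignorable_Code_Point`, so without this
    # override the zero-width pass above would have left it at width 0)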

-    # Override for Hangul Jamo medial vowels & final consonants
-    for i in range(0x1160, 0x11FF + 1):
-        width_map[i] = EffectiveWidth.ZERO
-
    tables = make_tables(TABLE_CFGS, enumerate(width_map))

    print("------------------------")
    total_size = 0
-    for (i, table) in enumerate(tables):
+    for i, table in enumerate(tables):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} Size: {size_bytes} bytes")
        total_size += size_bytes