Skip to content

Commit 397ab07

Browse files
Treat all jungseong and jongseong jamo as 0-width
Fixes #26
1 parent aed33e9 commit 397ab07

File tree

3 files changed

+206
-165
lines changed

3 files changed

+206
-165
lines changed

scripts/unicode.py

Lines changed: 55 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
6464

6565
def fetch_open(filename: str):
6666
"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
67-
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
67+
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
68+
"""
6869
if not os.path.exists(os.path.basename(filename)):
6970
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
7071
try:
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":
8384

8485
class EffectiveWidth(enum.IntEnum):
8586
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
86-
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
87+
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
88+
"""
8789

8890
ZERO = 0
8991
""" Zero columns wide. """
@@ -148,10 +150,10 @@ def load_zero_widths() -> "list[bool]":
148150
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
149151
character. `c` is considered a zero-width character if `c` is in general categories
150152
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
151-
or if it has the `Default_Ignorable_Code_Point` property (determined by fetching
152-
and processing `DerivedCoreProperties.txt`)."""
153+
if it has the `Default_Ignorable_Code_Point` property (determined by fetching
154+
and processing `DerivedCoreProperties.txt`), or if it has a `Hangul_Syllable_Type`
155+
of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`)."""
153156

154-
zw_cat_codes = ["Cc", "Cf", "Mn", "Me"]
155157
zw_map = []
156158

157159
with fetch_open("UnicodeData.txt") as categories:
@@ -164,7 +166,7 @@ def load_zero_widths() -> "list[bool]":
164166
raw_data[1],
165167
raw_data[2],
166168
]
167-
zero_width = cat_code in zw_cat_codes
169+
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
168170

169171
assert current <= codepoint
170172
while current <= codepoint:
@@ -182,30 +184,56 @@ def load_zero_widths() -> "list[bool]":
182184
zw_map.append(False)
183185

184186
with fetch_open("DerivedCoreProperties.txt") as properties:
185-
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
186-
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
187+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
188+
multiple = re.compile(
189+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
190+
)
187191

188192
for line in properties.readlines():
189-
raw_data = None # (low, high, category)
193+
raw_data = None # (low, high)
190194
if match := single.match(line):
191-
raw_data = (match.group(1), match.group(1), match.group(2))
195+
raw_data = (match.group(1), match.group(1))
192196
elif match := multiple.match(line):
193-
raw_data = (match.group(1), match.group(2), match.group(3))
197+
raw_data = (match.group(1), match.group(2))
198+
else:
199+
continue
200+
low = int(raw_data[0], 16)
201+
high = int(raw_data[1], 16)
202+
for cp in range(low, high + 1):
203+
zw_map[cp] = True
204+
205+
# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
206+
# as zero-width. This matches the behavior of glibc `wcwidth`.
207+
#
208+
# Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
209+
# a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
210+
# into a single wide grapheme. So we treat vowel and trailing jamo as
211+
# 0-width, such that only the width of the leading jamo is counted
212+
# and the resulting grapheme has width 2.
213+
with fetch_open("HangulSyllableType.txt") as categories:
214+
single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
215+
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
216+
217+
for line in categories.readlines():
218+
raw_data = None # (low, high)
219+
if match := single.match(line):
220+
raw_data = (match.group(1), match.group(1))
221+
elif match := multiple.match(line):
222+
raw_data = (match.group(1), match.group(2))
194223
else:
195224
continue
196225
low = int(raw_data[0], 16)
197226
high = int(raw_data[1], 16)
198-
cat = raw_data[2]
199-
if cat not in zw_cat_codes:
200-
for cp in range(low, high + 1):
201-
zw_map[cp] = True
227+
for cp in range(low, high + 1):
228+
zw_map[cp] = True
202229

203230
return zw_map
204231

205232

206233
class Bucket:
207234
"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
208-
list overlaps with another's width list, those buckets can be merged via `try_extend`."""
235+
list overlaps with another's width list, those buckets can be merged via `try_extend`.
236+
"""
209237

210238
def __init__(self):
211239
"""Creates an empty bucket."""
@@ -254,9 +282,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
254282
same bucket. Returns a list of the buckets in increasing order of those bits."""
255283
num_bits = cap_bit - low_bit
256284
assert num_bits > 0
257-
buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
285+
buckets = [Bucket() for _ in range(0, 2**num_bits)]
258286
mask = (1 << num_bits) - 1
259-
for (codepoint, width) in entries:
287+
for codepoint, width in entries:
260288
buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
261289
return buckets
262290

@@ -293,7 +321,7 @@ def __init__(
293321
buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))
294322

295323
for bucket in buckets:
296-
for (i, existing) in enumerate(self.indexed):
324+
for i, existing in enumerate(self.indexed):
297325
if existing.try_extend(bucket):
298326
self.entries.append(i)
299327
break
@@ -307,7 +335,8 @@ def __init__(
307335

308336
def indices_to_widths(self):
309337
"""Destructively converts the indices in this table to the `EffectiveWidth` values of
310-
their buckets. Assumes that no bucket contains codepoints with different widths."""
338+
their buckets. Assumes that no bucket contains codepoints with different widths.
339+
"""
311340
self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
312341
del self.indexed
313342

@@ -339,7 +368,7 @@ def make_tables(
339368
to include in the top-level table."""
340369
tables = []
341370
entry_groups = [entries]
342-
for (low_bit, cap_bit, offset_type) in table_cfgs:
371+
for low_bit, cap_bit, offset_type in table_cfgs:
343372
table = Table(entry_groups, low_bit, cap_bit, offset_type)
344373
entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
345374
tables.append(table)
@@ -350,7 +379,8 @@ def emit_module(
350379
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
351380
):
352381
"""Outputs a Rust module to `out_name` using table data from `tables`.
353-
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
382+
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
383+
"""
354384
if os.path.exists(out_name):
355385
os.remove(out_name)
356386
with open(out_name, "w", newline="\n", encoding="utf-8") as module:
@@ -456,7 +486,7 @@ def emit_module(
456486
)
457487

458488
subtable_count = 1
459-
for (i, table) in enumerate(tables):
489+
for i, table in enumerate(tables):
460490
new_subtable_count = len(table.buckets())
461491
if i == len(tables) - 1:
462492
table.indices_to_widths() # for the last table, indices == widths
@@ -466,7 +496,7 @@ def emit_module(
466496
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
467497
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
468498
)
469-
for (j, byte) in enumerate(byte_array):
499+
for j, byte in enumerate(byte_array):
470500
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
471501
if j % 15 == 0:
472502
module.write("\n ")
@@ -506,15 +536,11 @@ def main(module_filename: str):
506536
# Override for soft hyphen
507537
width_map[0x00AD] = EffectiveWidth.NARROW
508538

509-
# Override for Hangul Jamo medial vowels & final consonants
510-
for i in range(0x1160, 0x11FF + 1):
511-
width_map[i] = EffectiveWidth.ZERO
512-
513539
tables = make_tables(TABLE_CFGS, enumerate(width_map))
514540

515541
print("------------------------")
516542
total_size = 0
517-
for (i, table) in enumerate(tables):
543+
for i, table in enumerate(tables):
518544
size_bytes = len(table.to_bytes())
519545
print(f"Table {i} Size: {size_bytes} bytes")
520546
total_size += size_bytes

0 commit comments

Comments
 (0)