Skip to content

unicode.py: Don't use UnicodeData.txt anymore #39

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 23, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 28 additions & 49 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
# except according to those terms.

# This script uses the following Unicode tables:
#
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - HangulSyllableType.txt
# - PropList.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated module into git.
Expand Down Expand Up @@ -150,41 +153,20 @@ def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if

- it is in general category `Cc`,
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
- it is a control character,
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = []
zw_map = [False] * NUM_CODEPOINTS

# Characters with general category `Cc` have 0 width
with fetch_open("UnicodeData.txt") as categories:
current = 0
for line in categories.readlines():
if len(raw_data := line.split(";")) != 15:
continue
[codepoint, name, cat_code] = [
int(raw_data[0], 16),
raw_data[1],
raw_data[2],
]
zero_width = cat_code == "Cc"

assert current <= codepoint
while current <= codepoint:
if name.endswith(", Last>") or current == codepoint:
# if name ends with Last, we backfill the width value to all codepoints since
# the previous codepoint (aka the start of the range)
zw_map.append(zero_width)
else:
# unassigned characters are implicitly given Neutral width, which is nonzero
zw_map.append(False)
current += 1

while len(zw_map) < NUM_CODEPOINTS:
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
zw_map.append(False)
# Control characters have width 0
for c in range(0x00, 0x20):
zw_map[c] = True
for c in range(0x7F, 0xA0):
zw_map[c] = True

# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
Expand Down Expand Up @@ -214,6 +196,12 @@ def load_zero_widths() -> "list[bool]":
for cp in range(low, high + 1):
zw_map[cp] = True

# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
# as they canonically decompose to two characters with this property,
# but they aren't.
for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
zw_map[c] = True

# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
# as zero-width. This matches the behavior of glibc `wcwidth`.
#
Expand Down Expand Up @@ -248,18 +236,6 @@ def load_zero_widths() -> "list[bool]":
# width 2. Therefore, we treat it as having width 2.
zw_map[0x115F] = False

# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
# as they canonically decompose to two characters with this property,
# but they aren't.
zw_map[0x0CC0] = True
zw_map[0x0CC7] = True
zw_map[0x0CC8] = True
zw_map[0x0CCA] = True
zw_map[0x0CCB] = True
zw_map[0x1B3B] = True
zw_map[0x1B3D] = True
zw_map[0x1B43] = True

return zw_map


Expand Down Expand Up @@ -297,7 +273,7 @@ def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
result.sort()
return result

def width(self) -> "EffectiveWidth":
def width(self) -> "EffectiveWidth | None":
"""If all codepoints in this bucket have the same width, return that width; otherwise,
return `None`."""
if len(self.widths) == 0:
Expand Down Expand Up @@ -542,13 +518,16 @@ def main(module_filename: str):
lookup table for character width, and write a Rust module utilizing that table to
`module_filename`.

We obey the following rules in decreasing order of importance:
We obey the following rules, in decreasing order of importance:

- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
- Hangul jamo medial vowels & final consonants are zero-width.
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
- Control characters are zero-width.
- `Grapheme_Extend` characters, as well as eight characters that NFD decompose to `Grapheme_Extend` characters,
are zero-width.
- Codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
- Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

Expand Down