Skip to content

Commit 3885393

Browse files
authored
Merge pull request #39 from Jules-Bertholet/no-more-unicodedata
`unicode.py`: Don't use `UnicodeData.txt` anymore
2 parents afd136a + 787fed3 commit 3885393

File tree

1 file changed

+28
-49
lines changed

1 file changed

+28
-49
lines changed

scripts/unicode.py

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
# except according to those terms.
1212

1313
# This script uses the following Unicode tables:
14+
#
15+
# - DerivedCoreProperties.txt
1416
# - EastAsianWidth.txt
17+
# - HangulSyllableType.txt
18+
# - PropList.txt
1519
# - ReadMe.txt
16-
# - UnicodeData.txt
1720
#
1821
# Since this should not require frequent updates, we just store this
1922
# out-of-line and check the generated module into git.
@@ -150,41 +153,20 @@ def load_zero_widths() -> "list[bool]":
150153
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
151154
character. `c` is considered a zero-width character if
152155
153-
- it is in general category `Cc`,
154-
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
156+
- it is a control character,
155157
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
158+
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
159+
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
156160
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
157161
"""
158162

159-
zw_map = []
163+
zw_map = [False] * NUM_CODEPOINTS
160164

161-
# Characters with general category `Cc` have 0 width
162-
with fetch_open("UnicodeData.txt") as categories:
163-
current = 0
164-
for line in categories.readlines():
165-
if len(raw_data := line.split(";")) != 15:
166-
continue
167-
[codepoint, name, cat_code] = [
168-
int(raw_data[0], 16),
169-
raw_data[1],
170-
raw_data[2],
171-
]
172-
zero_width = cat_code == "Cc"
173-
174-
assert current <= codepoint
175-
while current <= codepoint:
176-
if name.endswith(", Last>") or current == codepoint:
177-
# if name ends with Last, we backfill the width value to all codepoints since
178-
# the previous codepoint (aka the start of the range)
179-
zw_map.append(zero_width)
180-
else:
181-
# unassigned characters are implicitly given Neutral width, which is nonzero
182-
zw_map.append(False)
183-
current += 1
184-
185-
while len(zw_map) < NUM_CODEPOINTS:
186-
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
187-
zw_map.append(False)
165+
# Control characters have width 0
166+
for c in range(0x00, 0x20):
167+
zw_map[c] = True
168+
for c in range(0x7F, 0xA0):
169+
zw_map[c] = True
188170

189171
# `Default_Ignorable_Code_Point`s also have 0 width:
190172
# https://www.unicode.org/faq/unsup_char.html#3
@@ -214,6 +196,12 @@ def load_zero_widths() -> "list[bool]":
214196
for cp in range(low, high + 1):
215197
zw_map[cp] = True
216198

199+
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
200+
# as they canonically decompose to two characters with this property,
201+
# but they aren't.
202+
for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
203+
zw_map[c] = True
204+
217205
# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
218206
# as zero-width. This matches the behavior of glibc `wcwidth`.
219207
#
@@ -248,18 +236,6 @@ def load_zero_widths() -> "list[bool]":
248236
# width 2. Therefore, we treat it as having width 2.
249237
zw_map[0x115F] = False
250238

251-
# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
252-
# as they canonically decompose to two characters with this property,
253-
# but they aren't.
254-
zw_map[0x0CC0] = True
255-
zw_map[0x0CC7] = True
256-
zw_map[0x0CC8] = True
257-
zw_map[0x0CCA] = True
258-
zw_map[0x0CCB] = True
259-
zw_map[0x1B3B] = True
260-
zw_map[0x1B3D] = True
261-
zw_map[0x1B43] = True
262-
263239
return zw_map
264240

265241

@@ -297,7 +273,7 @@ def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
297273
result.sort()
298274
return result
299275

300-
def width(self) -> "EffectiveWidth":
276+
def width(self) -> "EffectiveWidth | None":
301277
"""If all codepoints in this bucket have the same width, return that width; otherwise,
302278
return `None`."""
303279
if len(self.widths) == 0:
@@ -542,13 +518,16 @@ def main(module_filename: str):
542518
lookup table for character width, and write a Rust module utilizing that table to
543519
`module_filename`.
544520
545-
We obey the following rules in decreasing order of importance:
521+
We obey the following rules, in decreasing order of importance:
522+
546523
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
547524
- Hangul jamo medial vowels & final consonants are zero-width.
548-
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
549-
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
550-
- All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
551-
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
525+
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
526+
- Control characters are zero-width.
527+
- `Grapheme_Extend` characters, as well as eight characters that NFD decompose to `Grapheme_Extend` characters,
528+
are zero-width.
529+
- Codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
530+
- Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
552531
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
553532
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
554533

0 commit comments

Comments
 (0)