|
11 | 11 | # except according to those terms.
|
12 | 12 |
|
13 | 13 | # This script uses the following Unicode tables:
|
| 14 | +# |
| 15 | +# - DerivedCoreProperties.txt |
14 | 16 | # - EastAsianWidth.txt
|
| 17 | +# - HangulSyllableType.txt |
| 18 | +# - PropList.txt |
15 | 19 | # - ReadMe.txt
|
16 |
| -# - UnicodeData.txt |
17 | 20 | #
|
18 | 21 | # Since this should not require frequent updates, we just store this
|
19 | 22 | # out-of-line and check the generated module into git.
|
@@ -150,41 +153,20 @@ def load_zero_widths() -> "list[bool]":
|
150 | 153 | """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
|
151 | 154 | character. `c` is considered a zero-width character if
|
152 | 155 |
|
153 |
| - - it is in general category `Cc`, |
154 |
| - - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), |
| 156 | + - it is a control character, |
155 | 157 | - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
|
| 158 | + - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), |
| 159 | + - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug, |
156 | 160 | - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
|
157 | 161 | """
|
158 | 162 |
|
159 |
| - zw_map = [] |
| 163 | + zw_map = [False] * NUM_CODEPOINTS |
160 | 164 |
|
161 |
| - # Characters with general category `Cc` have 0 width |
162 |
| - with fetch_open("UnicodeData.txt") as categories: |
163 |
| - current = 0 |
164 |
| - for line in categories.readlines(): |
165 |
| - if len(raw_data := line.split(";")) != 15: |
166 |
| - continue |
167 |
| - [codepoint, name, cat_code] = [ |
168 |
| - int(raw_data[0], 16), |
169 |
| - raw_data[1], |
170 |
| - raw_data[2], |
171 |
| - ] |
172 |
| - zero_width = cat_code == "Cc" |
173 |
| - |
174 |
| - assert current <= codepoint |
175 |
| - while current <= codepoint: |
176 |
| - if name.endswith(", Last>") or current == codepoint: |
177 |
| - # if name ends with Last, we backfill the width value to all codepoints since |
178 |
| - # the previous codepoint (aka the start of the range) |
179 |
| - zw_map.append(zero_width) |
180 |
| - else: |
181 |
| - # unassigned characters are implicitly given Neutral width, which is nonzero |
182 |
| - zw_map.append(False) |
183 |
| - current += 1 |
184 |
| - |
185 |
| - while len(zw_map) < NUM_CODEPOINTS: |
186 |
| - # Catch any leftover codepoints. They must be unassigned (so nonzero width) |
187 |
| - zw_map.append(False) |
| 165 | + # Control characters have width 0 |
| 166 | + for c in range(0x00, 0x20): |
| 167 | + zw_map[c] = True |
| 168 | + for c in range(0x7F, 0xA0): |
| 169 | + zw_map[c] = True |
188 | 170 |
|
189 | 171 | # `Default_Ignorable_Code_Point`s also have 0 width:
|
190 | 172 | # https://www.unicode.org/faq/unsup_char.html#3
|
@@ -214,6 +196,12 @@ def load_zero_widths() -> "list[bool]":
|
214 | 196 | for cp in range(low, high + 1):
|
215 | 197 | zw_map[cp] = True
|
216 | 198 |
|
| 199 | + # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`, |
| 200 | + # as they canonically decompose to two characters with this property, |
| 201 | + # but they aren't. |
| 202 | + for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]: |
| 203 | + zw_map[c] = True |
| 204 | + |
217 | 205 | # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
|
218 | 206 | # as zero-width. This matches the behavior of glibc `wcwidth`.
|
219 | 207 | #
|
@@ -248,18 +236,6 @@ def load_zero_widths() -> "list[bool]":
|
248 | 236 | # width 2. Therefore, we treat it as having width 2.
|
249 | 237 | zw_map[0x115F] = False
|
250 | 238 |
|
251 |
| - # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`, |
252 |
| - # as they canonically decompose to two characters with this property, |
253 |
| - # but they aren't. |
254 |
| - zw_map[0x0CC0] = True |
255 |
| - zw_map[0x0CC7] = True |
256 |
| - zw_map[0x0CC8] = True |
257 |
| - zw_map[0x0CCA] = True |
258 |
| - zw_map[0x0CCB] = True |
259 |
| - zw_map[0x1B3B] = True |
260 |
| - zw_map[0x1B3D] = True |
261 |
| - zw_map[0x1B43] = True |
262 |
| - |
263 | 239 | return zw_map
|
264 | 240 |
|
265 | 241 |
|
@@ -297,7 +273,7 @@ def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
|
297 | 273 | result.sort()
|
298 | 274 | return result
|
299 | 275 |
|
300 |
| - def width(self) -> "EffectiveWidth": |
| 276 | + def width(self) -> "EffectiveWidth | None": |
301 | 277 | """If all codepoints in this bucket have the same width, return that width; otherwise,
|
302 | 278 | return `None`."""
|
303 | 279 | if len(self.widths) == 0:
|
@@ -542,13 +518,16 @@ def main(module_filename: str):
|
542 | 518 | lookup table for character width, and write a Rust module utilizing that table to
|
543 | 519 | `module_filename`.
|
544 | 520 |
|
545 |
| - We obey the following rules in decreasing order of importance: |
| 521 | + We obey the following rules, in decreasing order of importance: |
| 522 | +
|
546 | 523 | - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
|
547 | 524 | - Hangul jamo medial vowels & final consonants are zero-width.
|
548 |
| - - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. |
549 |
| - - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width. |
550 |
| - - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. |
551 |
| - - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. |
| 525 | + - `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. |
| 526 | + - Control characters are zero-width. |
| 527 | + - `Grapheme_Extend` characters, as well as eight characters that NFD decompose to `Grapheme_Extend` characters, |
| 528 | + are zero-width. |
| 529 | + - Codepoints with an East Asian Width of `Ambiguous` are ambiguous-width. |
| 530 | + - Codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. |
552 | 531 | - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
|
553 | 532 | of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
|
554 | 533 |
|
|
0 commit comments