Commit fda272b

Merge pull request #34 from Jules-Bertholet/default-ignorable-code-point
Fixes to characters considered zero-width
2 parents 8942487 + aae585f commit fda272b

5 files changed: +368 -230 lines changed


README.md

Lines changed: 6 additions & 3 deletions
@@ -26,7 +26,8 @@ fn main() {
 
 **NOTE:** The computed width values may not match the actual rendered column
 width. For example, the woman scientist emoji comprises of a woman emoji, a
-zero-width joiner and a microscope emoji.
+zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences)
+are considered to have the sum of the widths of their constituent parts:
 
 ```rust
 extern crate unicode_width;
@@ -39,8 +40,10 @@ fn main() {
 }
 ```
 
-See [Unicode Standard Annex #11][UAX11] for precise details on what is and isn't
-covered by this crate.
+Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence)
+and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may
+be rendered with a different width than what this crate says. (This is not an
+exhaustive list.)
 
 ## features
 
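
The note added above can be checked directly against the crate's API. The following is a minimal sketch, not part of the diff, assuming the published `unicode-width` API; it shows the woman scientist emoji counted as the sum of its parts rather than the 2 columns most terminals render:

```rust
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

fn main() {
    // U+1F469 WOMAN + U+200D ZERO WIDTH JOINER + U+1F52C MICROSCOPE
    let scientist = "\u{1F469}\u{200D}\u{1F52C}";
    assert_eq!('\u{1F469}'.width(), Some(2));
    assert_eq!('\u{200D}'.width(), Some(0)); // zero-width joiner
    assert_eq!('\u{1F52C}'.width(), Some(2));
    // The string's width is the sum of its scalar values' widths: 2 + 0 + 2.
    assert_eq!(scientist.width(), 4);
}
```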

scripts/unicode.py

Lines changed: 91 additions & 27 deletions
@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):
 
 def fetch_open(filename: str):
     """Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
-    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
+    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
+    """
     if not os.path.exists(os.path.basename(filename)):
         os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
     try:
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":
 
 class EffectiveWidth(enum.IntEnum):
     """Represents the width of a Unicode character. All East Asian Width classes resolve into
-    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
+    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
+    """
 
     ZERO = 0
     """ Zero columns wide. """
@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
 
 def load_zero_widths() -> "list[bool]":
     """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
-    character. `c` is considered a zero-width character if `c` is in general categories
-    `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
+    character. `c` is considered a zero-width character if
+
+    - it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
+    - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
+    - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
+    """
+
+    zw_map = []
+
+    # Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
     with fetch_open("UnicodeData.txt") as categories:
-        zw_map = []
         current = 0
         for line in categories.readlines():
             if len(raw_data := line.split(";")) != 15:
@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
                 raw_data[1],
                 raw_data[2],
             ]
-            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
+            zero_width = cat_code in ["Cc", "Mn", "Me"]
 
             assert current <= codepoint
             while current <= codepoint:
@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
            # Catch any leftover codepoints. They must be unassigned (so nonzero width)
            zw_map.append(False)
 
-    return zw_map
+    # `Default_Ignorable_Code_Point`s also have 0 width:
+    # https://www.unicode.org/faq/unsup_char.html#3
+    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
+    with fetch_open("DerivedCoreProperties.txt") as properties:
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )
+
+        for line in properties.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                zw_map[cp] = True
+
+    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
+    # as zero-width. This matches the behavior of glibc `wcwidth`.
+    #
+    # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
+    # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
+    # into a single wide grapheme. So we treat vowel and trailing jamo as
+    # 0-width, such that only the width of the leading jamo is counted
+    # and the resulting grapheme has width 2.
+    #
+    # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
+    with fetch_open("HangulSyllableType.txt") as categories:
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
+
+        for line in categories.readlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                zw_map[cp] = True
+
+    # Special case: U+115F HANGUL CHOSEONG FILLER.
+    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
+    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
+    # (which are considered 0-width on their own) to form a composed Hangul syllable with
+    # width 2. Therefore, we treat it as having width 2.
+    zw_map[0x115F] = False
+    return zw_map
 
 
 class Bucket:
     """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
-    list overlaps with another's width list, those buckets can be merged via `try_extend`."""
+    list overlaps with another's width list, those buckets can be merged via `try_extend`.
+    """
 
     def __init__(self):
         """Creates an empty bucket."""
@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
     same bucket. Returns a list of the buckets in increasing order of those bits."""
     num_bits = cap_bit - low_bit
     assert num_bits > 0
-    buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
+    buckets = [Bucket() for _ in range(0, 2**num_bits)]
     mask = (1 << num_bits) - 1
-    for (codepoint, width) in entries:
+    for codepoint, width in entries:
         buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
     return buckets
 
@@ -269,7 +334,7 @@ def __init__(
         buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))
 
         for bucket in buckets:
-            for (i, existing) in enumerate(self.indexed):
+            for i, existing in enumerate(self.indexed):
                 if existing.try_extend(bucket):
                     self.entries.append(i)
                     break
@@ -283,7 +348,8 @@ def __init__(
 
     def indices_to_widths(self):
        """Destructively converts the indices in this table to the `EffectiveWidth` values of
-        their buckets. Assumes that no bucket contains codepoints with different widths."""
+        their buckets. Assumes that no bucket contains codepoints with different widths.
+        """
         self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
         del self.indexed
 
@@ -315,7 +381,7 @@ def make_tables(
     to include in the top-level table."""
     tables = []
     entry_groups = [entries]
-    for (low_bit, cap_bit, offset_type) in table_cfgs:
+    for low_bit, cap_bit, offset_type in table_cfgs:
         table = Table(entry_groups, low_bit, cap_bit, offset_type)
         entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
         tables.append(table)
@@ -326,7 +392,8 @@ def emit_module(
     out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
 ):
     """Outputs a Rust module to `out_name` using table data from `tables`.
-    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
+    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
+    """
     if os.path.exists(out_name):
         os.remove(out_name)
     with open(out_name, "w", newline="\n", encoding="utf-8") as module:
@@ -432,7 +499,7 @@ def emit_module(
         )
 
         subtable_count = 1
-        for (i, table) in enumerate(tables):
+        for i, table in enumerate(tables):
             new_subtable_count = len(table.buckets())
             if i == len(tables) - 1:
                 table.indices_to_widths()  # for the last table, indices == widths
@@ -442,7 +509,7 @@ def emit_module(
 /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
 static TABLES_{i}: [u8; {len(byte_array)}] = ["""
             )
-            for (j, byte) in enumerate(byte_array):
+            for j, byte in enumerate(byte_array):
                 # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                 if j % 15 == 0:
                     module.write("\n ")
@@ -458,16 +525,17 @@ def main(module_filename: str):
     `module_filename`.
 
     We obey the following rules in decreasing order of importance:
-    - The soft hyphen (`U+00AD`) is single-width.
-    - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
-    - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
+    - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
+    - Hangul jamo medial vowels & final consonants are zero-width.
+    - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
+    - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
     - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
     - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
     - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
-    of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
+      of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
 
-    These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
-    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
+    These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
+    """
     version = load_unicode_version()
     print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
 
@@ -482,15 +550,11 @@ def main(module_filename: str):
     # Override for soft hyphen
     width_map[0x00AD] = EffectiveWidth.NARROW
 
-    # Override for Hangul Jamo medial vowels & final consonants
-    for i in range(0x1160, 0x11FF + 1):
-        width_map[i] = EffectiveWidth.ZERO
-
     tables = make_tables(TABLE_CFGS, enumerate(width_map))
 
     print("------------------------")
     total_size = 0
-    for (i, table) in enumerate(tables):
+    for i, table in enumerate(tables):
         size_bytes = len(table.to_bytes())
         print(f"Table {i} Size: {size_bytes} bytes")
         total_size += size_bytes
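
The practical effect of the new `load_zero_widths` rules can be sketched against the crate's public API. The following is not part of the diff; it assumes a build of `unicode-width` generated from these updated tables, and the code points below are chosen only as illustrations of each rule:

```rust
use unicode_width::UnicodeWidthChar;

fn main() {
    // Default_Ignorable_Code_Points such as U+200B ZERO WIDTH SPACE remain zero-width.
    assert_eq!('\u{200B}'.width(), Some(0));
    // A format (`Cf`) character that is not default-ignorable, e.g. U+0600 ARABIC
    // NUMBER SIGN, is no longer forced to zero width by the old `Cf` rule.
    assert_eq!('\u{0600}'.width(), Some(1));
    // Decomposed Hangul: the leading jamo carries the syllable's full width,
    // while vowel and trailing jamo are zero-width.
    assert_eq!('\u{1100}'.width(), Some(2)); // HANGUL CHOSEONG KIYEOK
    assert_eq!('\u{1161}'.width(), Some(0)); // HANGUL JUNGSEONG A (Vowel_Jamo)
    assert_eq!('\u{11A8}'.width(), Some(0)); // HANGUL JONGSEONG KIYEOK (Trailing_Jamo)
    // U+115F HANGUL CHOSEONG FILLER is special-cased to keep width 2.
    assert_eq!('\u{115F}'.width(), Some(2));
}
```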

src/lib.rs

Lines changed: 12 additions & 7 deletions
@@ -43,9 +43,10 @@
 //! ```
 
 #![deny(missing_docs, unsafe_code)]
-#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
-       html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
-
+#![doc(
+    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
+    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
+)]
 #![cfg_attr(feature = "bench", feature(test))]
 #![no_std]
 
@@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {
 
 impl UnicodeWidthChar for char {
     #[inline]
-    fn width(self) -> Option<usize> { cw::width(self, false) }
+    fn width(self) -> Option<usize> {
+        cw::width(self, false)
+    }
 
     #[inline]
-    fn width_cjk(self) -> Option<usize> { cw::width(self, true) }
+    fn width_cjk(self) -> Option<usize> {
+        cw::width(self, true)
+    }
 }
 
 /// Methods for determining displayed width of Unicode strings.
@@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
     /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
     /// as 1 column wide. This is consistent with the recommendations for
     /// non-CJK contexts, or when the context cannot be reliably determined.
-    fn width<'a>(&'a self) -> usize;
+    fn width(&self) -> usize;
 
     /// Returns the string's displayed width in columns.
     ///
@@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {
     /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
     /// as 2 column wide. This is consistent with the recommendations for
     /// CJK contexts.
-    fn width_cjk<'a>(&'a self) -> usize;
+    fn width_cjk(&self) -> usize;
 }
 
 impl UnicodeWidthStr for str {
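
The trait documentation touched above distinguishes the non-CJK and CJK treatment of East Asian `Ambiguous` characters. A minimal usage sketch, where U+00B1 is merely one example of an ambiguous-width character:

```rust
use unicode_width::UnicodeWidthStr;

fn main() {
    let s = "\u{00B1}"; // PLUS-MINUS SIGN, East Asian Width `Ambiguous`
    assert_eq!(s.width(), 1); // treated as narrow outside CJK contexts
    assert_eq!(s.width_cjk(), 2); // treated as wide in CJK contexts
}
```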
