
Fixes to characters considered zero-width #34

Merged 6 commits on Feb 13, 2024

README.md: 9 changes (6 additions, 3 deletions)
@@ -26,7 +26,8 @@ fn main() {

**NOTE:** The computed width values may not match the actual rendered column
width. For example, the woman scientist emoji comprises a woman emoji, a
zero-width joiner and a microscope emoji.
zero-width joiner and a microscope emoji. Such [emoji ZWJ sequences](https://www.unicode.org/reports/tr51/#Emoji_ZWJ_Sequences)
are considered to have the sum of the widths of their constituent parts:

```rust
extern crate unicode_width;
@@ -39,8 +40,10 @@
}
```

See [Unicode Standard Annex #11][UAX11] for precise details on what is and isn't
covered by this crate.
Additionally, [defective combining character sequences](https://unicode.org/glossary/#defective_combining_character_sequence)
and nonstandard [Korean jamo](https://unicode.org/glossary/#jamo) sequences may
be rendered with a different width than what this crate says. (This is not an
exhaustive list.)
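
For instance, here is a minimal sketch of how this crate measures such sequences (assuming the usual `UnicodeWidthStr` API; actual terminal rendering may differ):

```rust
extern crate unicode_width;

use unicode_width::UnicodeWidthStr;

fn main() {
    // A lone combining acute accent (U+0301) is a defective combining
    // character sequence: this crate reports width 0, but many terminals
    // draw it over a dotted-circle placeholder occupying one column.
    assert_eq!("\u{301}".width(), 0);

    // A lone medial vowel jamo (U+1161) is likewise reported as width 0,
    // even though a renderer may give it a column of its own.
    assert_eq!("\u{1161}".width(), 0);
}
```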

## features

scripts/unicode.py: 118 changes (91 additions, 27 deletions)
@@ -64,7 +64,8 @@ class OffsetType(enum.IntEnum):

def fetch_open(filename: str):
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
"""
if not os.path.exists(os.path.basename(filename)):
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
try:
@@ -83,7 +84,8 @@ def load_unicode_version() -> "tuple[int, int, int]":

class EffectiveWidth(enum.IntEnum):
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
"""

ZERO = 0
""" Zero columns wide. """
@@ -146,10 +148,17 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":

def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if `c` is in general categories
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
character. `c` is considered a zero-width character if

- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = []

# Characters with general category `Cc`, `Mn`, or `Me` have 0 width...
with fetch_open("UnicodeData.txt") as categories:
zw_map = []
current = 0
for line in categories.readlines():
if len(raw_data := line.split(";")) != 15:
@@ -159,7 +168,7 @@ def load_zero_widths() -> "list[bool]":
raw_data[1],
raw_data[2],
]
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
zero_width = cat_code in ["Cc", "Mn", "Me"]

assert current <= codepoint
while current <= codepoint:
@@ -176,12 +185,68 @@ def load_zero_widths() -> "list[bool]":
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
zw_map.append(False)

return zw_map
# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
with fetch_open("DerivedCoreProperties.txt") as properties:
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
multiple = re.compile(
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
)

for line in properties.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True

# Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
# as zero-width. This matches the behavior of glibc `wcwidth`.
#
# Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
# a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
# into a single wide grapheme. So we treat vowel and trailing jamo as
# 0-width, such that only the width of the leading jamo is counted
# and the resulting grapheme has width 2.
#
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
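#
# For example (illustrative only): U+1100 HANGUL CHOSEONG KIYEOK (width 2)
# + U+1161 HANGUL JUNGSEONG A (width 0) + U+11A8 HANGUL JONGSEONG KIYEOK
# (width 0) compose into the single syllable 각, giving a total width of 2.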
with fetch_open("HangulSyllableType.txt") as categories:
single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")

for line in categories.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True

# Special case: U+115F HANGUL CHOSEONG FILLER.
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
# (which are considered 0-width on their own) to form a composed Hangul syllable with
# width 2. Therefore, we treat it as having width 2.
zw_map[0x115F] = False
return zw_map


class Bucket:
"""A bucket contains a group of codepoints and an ordered width list. If one bucket's width
list overlaps with another's width list, those buckets can be merged via `try_extend`."""
list overlaps with another's width list, those buckets can be merged via `try_extend`.
"""

def __init__(self):
"""Creates an empty bucket."""
@@ -230,9 +295,9 @@ def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
same bucket. Returns a list of the buckets in increasing order of those bits."""
num_bits = cap_bit - low_bit
assert num_bits > 0
buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
buckets = [Bucket() for _ in range(0, 2**num_bits)]
mask = (1 << num_bits) - 1
for (codepoint, width) in entries:
for codepoint, width in entries:
buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
return buckets

@@ -269,7 +334,7 @@ def __init__(
buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

for bucket in buckets:
for (i, existing) in enumerate(self.indexed):
for i, existing in enumerate(self.indexed):
if existing.try_extend(bucket):
self.entries.append(i)
break
@@ -283,7 +348,8 @@ def __init__(

def indices_to_widths(self):
"""Destructively converts the indices in this table to the `EffectiveWidth` values of
their buckets. Assumes that no bucket contains codepoints with different widths."""
their buckets. Assumes that no bucket contains codepoints with different widths.
"""
self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
del self.indexed

@@ -315,7 +381,7 @@ def make_tables(
to include in the top-level table."""
tables = []
entry_groups = [entries]
for (low_bit, cap_bit, offset_type) in table_cfgs:
for low_bit, cap_bit, offset_type in table_cfgs:
table = Table(entry_groups, low_bit, cap_bit, offset_type)
entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
tables.append(table)
@@ -326,7 +392,8 @@ def emit_module(
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
):
"""Outputs a Rust module to `out_name` using table data from `tables`.
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
"""
if os.path.exists(out_name):
os.remove(out_name)
with open(out_name, "w", newline="\n", encoding="utf-8") as module:
@@ -432,7 +499,7 @@ def emit_module(
)

subtable_count = 1
for (i, table) in enumerate(tables):
for i, table in enumerate(tables):
new_subtable_count = len(table.buckets())
if i == len(tables) - 1:
table.indices_to_widths() # for the last table, indices == widths
@@ -442,7 +509,7 @@
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
)
for (j, byte) in enumerate(byte_array):
for j, byte in enumerate(byte_array):
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
if j % 15 == 0:
module.write("\n ")
@@ -458,16 +525,17 @@ def main(module_filename: str):
`module_filename`.

We obey the following rules in decreasing order of importance:
- The soft hyphen (`U+00AD`) is single-width.
- Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
- All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
- Hangul jamo medial vowels & final consonants are zero-width.
- All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
- All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
- All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
- All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
- All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.
of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

These rules are based off of Markus Kuhn's free `wcwidth()` implementation:
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
These rules are based off of UAX11, other Unicode standards, and various `wcwidth()` implementations.
"""
version = load_unicode_version()
print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

@@ -482,15 +550,11 @@ def main(module_filename: str):
# Override for soft hyphen
width_map[0x00AD] = EffectiveWidth.NARROW

# Override for Hangul Jamo medial vowels & final consonants
for i in range(0x1160, 0x11FF + 1):
width_map[i] = EffectiveWidth.ZERO

tables = make_tables(TABLE_CFGS, enumerate(width_map))

print("------------------------")
total_size = 0
for (i, table) in enumerate(tables):
for i, table in enumerate(tables):
size_bytes = len(table.to_bytes())
print(f"Table {i} Size: {size_bytes} bytes")
total_size += size_bytes
src/lib.rs: 19 changes (12 additions, 7 deletions)
@@ -43,9 +43,10 @@
//! ```

#![deny(missing_docs, unsafe_code)]
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]

#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(feature = "bench", feature(test))]
#![no_std]

@@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {

impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> { cw::width(self, false) }
fn width(self) -> Option<usize> {
cw::width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> { cw::width(self, true) }
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
}
}

/// Methods for determining displayed width of Unicode strings.
Expand All @@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
/// non-CJK contexts, or when the context cannot be reliably determined.
fn width<'a>(&'a self) -> usize;
fn width(&self) -> usize;

/// Returns the string's displayed width in columns.
///
@@ -113,7 +118,7 @@
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
/// CJK contexts.
fn width_cjk<'a>(&'a self) -> usize;
fn width_cjk(&self) -> usize;
}
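
// Illustrative example (not part of this change): U+00A1 '¡' has an East Asian
// Width of `Ambiguous`, so "¡".width() == 1 but "¡".width_cjk() == 2.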

impl UnicodeWidthStr for str {