Skip to content

Commit 6b503fa

Browse files
Support emoji presentation sequences
1 parent 3885393 commit 6b503fa

File tree

6 files changed

+309
-18
lines changed

6 files changed

+309
-18
lines changed

.github/workflows/rust.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ jobs:
3232
runs-on: ubuntu-latest
3333
steps:
3434
- uses: actions/checkout@v3
35+
- uses: actions/setup-python@v5
36+
with:
37+
python-version: '3.12'
3538
- name: Regen
3639
run: cd scripts && python3 unicode.py
3740
- name: Diff

benches/benches.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,11 @@ fn jawiki(b: &mut Bencher) {
104104
let string = std::fs::read_to_string(data_path).unwrap_or_default();
105105
b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
106106
}
107+
108+
#[bench]
109+
fn emoji(b: &mut Bencher) {
110+
// To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt
111+
let data_path = "bench_data/emoji-style.txt";
112+
let string = std::fs::read_to_string(data_path).unwrap_or_default();
113+
b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str())));
114+
}

scripts/unicode.py

Lines changed: 141 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# - HangulSyllableType.txt
1818
# - PropList.txt
1919
# - ReadMe.txt
20+
# - emoji/emoji-variation-sequences.txt
2021
#
2122
# Since this should not require frequent updates, we just store this
2223
# out-of-line and check the generated module into git.
@@ -26,6 +27,8 @@
2627
import os
2728
import re
2829
import sys
30+
from collections import defaultdict
31+
from itertools import batched
2932

3033
NUM_CODEPOINTS = 0x110000
3134
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
@@ -69,12 +72,13 @@ def fetch_open(filename: str):
6972
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
7073
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
7174
"""
72-
if not os.path.exists(os.path.basename(filename)):
75+
basename = os.path.basename(filename)
76+
if not os.path.exists(basename):
7377
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
7478
try:
75-
return open(filename, encoding="utf-8")
79+
return open(basename, encoding="utf-8")
7680
except OSError:
77-
sys.stderr.write(f"cannot load {filename}")
81+
sys.stderr.write(f"cannot load {basename}")
7882
sys.exit(1)
7983

8084

@@ -384,8 +388,71 @@ def make_tables(
384388
return tables
385389

386390

391+
def load_variation_sequences() -> "list[int]":
    """Returns the codepoints that are valid starts of an emoji presentation sequence.

    Reads `emoji/emoji-variation-sequences.txt` (opened through `fetch_open`) and collects,
    in file order, the leading codepoint of every line of the form
    `<codepoint> FE0F ; emoji style`.
    """

    # An emoji presentation sequence entry: one codepoint followed by U+FE0F,
    # and labeled "emoji style".
    emoji_style_entry = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")

    with fetch_open("emoji/emoji-variation-sequences.txt") as data_file:
        return [
            int(found.group(1), 16)
            for line in data_file
            if (found := emoji_style_entry.match(line))
        ]
405+
406+
407+
def make_variation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
) -> "tuple[list[int], list[list[int]]]":
    """Generates a 2-level lookup table for whether a codepoint might start an emoji
    presentation sequence. (Characters that are always wide may be excluded.)

    The first level is a match on all but the 10 LSB of the codepoint; the second level is a
    1024-bit bitmap (stored as 128 bytes) indexed by those 10 LSB.

    `seqs` is the list of codepoints that can start an emoji presentation sequence, and
    `width_map` is indexed by codepoint.

    Returns `(indexes, leaves)`, where `indexes[i]` is the `cp >> 10` prefix whose bitmap is
    `leaves[i]`. Within a leaf, the bit for a codepoint lives in byte `(cp & 0x3FF) // 8`
    at bit position `cp & 7`.
    """

    # Group the low 10 bits of each codepoint under its high-bits prefix.
    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # We don't strictly need to keep track of characters that are always wide,
    # because being in an emoji variation seq won't affect their width.
    # So store their info only when it wouldn't inflate the size of the tables:
    # a prefix whose entries are all wide is dropped entirely.
    for k in list(prefixes_dict.keys()):
        if all(
            width_map[(k << 10) | low_bits] == EffectiveWidth.WIDE
            for low_bits in prefixes_dict[k]
        ):
            del prefixes_dict[k]

    indexes = list(prefixes_dict.keys())

    # Similarly, we can spuriously return `true` for always-wide characters
    # even if not part of a presentation seq; this saves an additional lookup,
    # so we should do it where there is no size cost.
    # Membership is checked against the dict (constant time) rather than the `indexes`
    # list; the key set is not modified in this loop, so both hold the same prefixes.
    for cp, width in enumerate(width_map):
        if width == EffectiveWidth.WIDE and (cp >> 10) in prefixes_dict:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)

    # Pack each prefix's set of 10-bit offsets into a 128-byte bitmap.
    # Dict iteration order matches `indexes`, so leaves line up with their prefixes.
    leaves = []
    for cps in prefixes_dict.values():
        leaf = [0] * 128
        for cp in cps:
            idx_in_leaf, bit_shift = divmod(cp, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift
        leaves.append(leaf)
    return (indexes, leaves)
449+
450+
387451
def emit_module(
388-
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
452+
out_name: str,
453+
unicode_version: "tuple[int, int, int]",
454+
tables: "list[Table]",
455+
variation_table: "tuple[list[int], list[list[int]]]",
389456
):
390457
"""Outputs a Rust module to `out_name` using table data from `tables`.
391458
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
@@ -462,6 +529,40 @@ def emit_module(
462529
"""
463530
)
464531

532+
variation_idx, variation_leaves = variation_table
533+
534+
module.write(
535+
"""
536+
/// Whether this character forms an [emoji presentation sequence]
537+
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
538+
/// when followed by `'\\u{FE0F}'`.
539+
/// Emoji presentation sequences are considered to have width 2.
540+
/// This may spuriously return `true` or `false` for characters that are always wide.
541+
#[inline]
542+
pub fn starts_emoji_presentation_seq(c: char) -> bool {
543+
let cp: u32 = c.into();
544+
// First level of lookup uses all but 10 LSB
545+
let top_bits = cp >> 10;
546+
let idx_of_leaf: usize = match top_bits {
547+
"""
548+
)
549+
550+
for i, msbs in enumerate(variation_idx):
551+
module.write(f" {msbs} => {i},\n")
552+
553+
module.write(
554+
""" _ => return false,
555+
};
556+
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
557+
// and use them to index into `leaf_row`.
558+
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
559+
let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
560+
// Use the 3 LSB of `cp` to index into `leaf_byte`.
561+
((leaf_byte >> (cp & 7)) & 1) == 1
562+
}
563+
"""
564+
)
565+
465566
module.write(
466567
"""
467568
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
@@ -510,6 +611,29 @@ def emit_module(
510611
module.write(f" 0x{byte:02X},")
511612
module.write("\n ];\n")
512613
subtable_count = new_subtable_count
614+
615+
# emoji table
616+
617+
module.write(
618+
f"""
619+
#[repr(align(128))]
620+
struct Align128<T>(T);
621+
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
622+
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
623+
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
624+
"""
625+
)
626+
for leaf in variation_leaves:
627+
module.write(" [\n")
628+
for row in batched(leaf, 14):
629+
module.write(" ")
630+
for entry in row:
631+
module.write(f" 0x{entry:02X},")
632+
module.write("\n")
633+
module.write(" ],\n")
634+
635+
module.write(" ]);\n")
636+
513637
module.write("}\n")
514638

515639

@@ -520,6 +644,7 @@ def main(module_filename: str):
520644
521645
We obey the following rules, in decreasing order of importance:
522646
647+
- Emoji presentation sequences are double-width.
523648
- The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
524649
- Hangul jamo medial vowels & final consonants are zero-width.
525650
- `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
@@ -549,16 +674,25 @@ def main(module_filename: str):
549674

550675
tables = make_tables(TABLE_CFGS, enumerate(width_map))
551676

677+
emoji_variations = load_variation_sequences()
678+
variation_table = make_variation_sequence_table(emoji_variations, width_map)
679+
552680
print("------------------------")
553681
total_size = 0
554682
for i, table in enumerate(tables):
555683
size_bytes = len(table.to_bytes())
556-
print(f"Table {i} Size: {size_bytes} bytes")
684+
print(f"Table {i} size: {size_bytes} bytes")
557685
total_size += size_bytes
686+
emoji_index_size = len(variation_table[0]) * 4
687+
print(f"Emoji presentation index size: {emoji_index_size} bytes")
688+
total_size += emoji_index_size
689+
emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
690+
print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
691+
total_size += emoji_leaves_size
558692
print("------------------------")
559-
print(f" Total Size: {total_size} bytes")
693+
print(f" Total size: {total_size} bytes")
560694

561-
emit_module(module_filename, version, tables)
695+
emit_module(module_filename, version, tables, variation_table)
562696
print(f'Wrote to "{module_filename}"')
563697

564698

src/lib.rs

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,11 @@
3333
//! This crate currently uses the following rules to determine the width of a
3434
//! character or string, in order of decreasing precedence. These may be tweaked in the future.
3535
//!
36-
//! 1. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
37-
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
38-
//! 3. The following have width 0:
36+
//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
37+
//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.)
38+
//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1.
39+
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
40+
//! 4. The following have width 0:
3941
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4042
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
4143
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -53,15 +55,15 @@
5355
//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593)
5456
//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
5557
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
56-
//! 4. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
58+
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
5759
//! have no defined width, and are ignored when determining the width of a string.
58-
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60+
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
5961
//! with an [`East_Asian_Width`] of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
6062
//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
61-
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
63+
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
6264
//! with an [`East_Asian_Width`] of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
6365
//! have width 2 in an East Asian context, and width 1 otherwise.
64-
//! 7. All other characters have width 1.
66+
//! 8. All other characters have width 1.
6567
//!
6668
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
6769
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
@@ -122,7 +124,9 @@ impl UnicodeWidthChar for char {
122124
pub trait UnicodeWidthStr {
123125
/// Returns the string's displayed width in columns.
124126
///
125-
/// Control characters are treated as having zero width.
127+
/// Control characters are treated as having zero width,
128+
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
129+
/// are assigned width 2.
126130
///
127131
/// This function treats characters in the Ambiguous category according
128132
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -132,7 +136,9 @@ pub trait UnicodeWidthStr {
132136

133137
/// Returns the string's displayed width in columns.
134138
///
135-
/// Control characters are treated as having zero width.
139+
/// Control characters are treated as having zero width,
140+
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
141+
/// are assigned width 2.
136142
///
137143
/// This function treats characters in the Ambiguous category according
138144
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
@@ -144,11 +150,28 @@ pub trait UnicodeWidthStr {
144150
impl UnicodeWidthStr for str {
145151
#[inline]
146152
fn width(&self) -> usize {
147-
self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum()
153+
str_width(self, false)
148154
}
149155

150156
#[inline]
151157
fn width_cjk(&self) -> usize {
152-
self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum()
158+
str_width(self, true)
153159
}
154160
}
161+
162+
fn str_width(s: &str, is_cjk: bool) -> usize {
163+
s.chars()
164+
.rfold((0, false), |(sum, was_fe0f), c| {
165+
if c == '\u{FE0F}' {
166+
(sum, true)
167+
} else {
168+
let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) {
169+
2
170+
} else {
171+
cw::width(c, is_cjk).unwrap_or(0)
172+
};
173+
(sum + add, false)
174+
}
175+
})
176+
.0
177+
}

0 commit comments

Comments
 (0)