From 130f3fde22fe9bed5da40c55a93430ce262e9a4f Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 10 Feb 2024 16:43:23 -0500 Subject: [PATCH 01/13] Treat emoji presentation sequences as fullwidth --- scripts/unicode.py | 73 ++++++++++++++-- src/lib.rs | 31 ++++++- src/tables.rs | 209 +++++++++++++++++++++++++++++++++++++++++++++ src/tests.rs | 14 +++ 4 files changed, 320 insertions(+), 7 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index e91f001..fe0236c 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -66,12 +66,13 @@ def fetch_open(filename: str): """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure. """ + basename = os.path.basename(filename) if not os.path.exists(os.path.basename(filename)): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: - return open(filename, encoding="utf-8") + return open(basename, encoding="utf-8") except OSError: - sys.stderr.write(f"cannot load {filename}") + sys.stderr.write(f"cannot load {basename}") sys.exit(1) @@ -152,7 +153,8 @@ def load_zero_widths() -> "list[bool]": - it is in general category `Cc`, - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), - - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`), + - or if it is one of U+0CC0, U+0CC7, U+0CC8, U+0CCA, U+0CCB, U+1B3B, U+1B3D, or U+1B43, + - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`) and is not U+115F, - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`). """ @@ -408,8 +410,29 @@ def make_tables( return tables +def variation_sequences() -> "list[tuple[int, int]]": + """Outputs a list of character ranages, corresponding to all the valid characters for starting + an emoji presentation sequence.""" + + with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") + ranges = [] + for line in sequences.readlines(): + if match := sequence.match(line): + cp = int(match.group(1), 16) + if ranges != [] and ranges[-1][1] == cp - 1: + ranges[-1] = (ranges[-1][0], cp) + else: + ranges.append((cp, cp)) + + return ranges + + def emit_module( - out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" + out_name: str, + unicode_version: "tuple[int, int, int]", + tables: "list[Table]", + emoji_variations: "list[tuple[int, int]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -486,6 +509,31 @@ def emit_module( """ ) + module.write( + """ + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\\u{FEOF}'`. + /// Emoji presentation sequences are considered to have width 2. 
+ #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + + EMOJI_PRESENTATION_RANGES + .binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() + } +""" + ) + module.write( """ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -534,6 +582,20 @@ def emit_module( module.write(f" 0x{byte:02X},") module.write("\n ];\n") subtable_count = new_subtable_count + + # emoji table + + module.write( + f""" + /// Each tuple corresponds to a range (inclusive at both ends) + /// of characters that can start an emoji presentation sequence. + static EMOJI_PRESENTATION_RANGES: [(char, char); {len(emoji_variations)}] = [ +""" + ) + for lo, hi in emoji_variations: + module.write(f" ('\\u{{{lo:X}}}', '\\u{{{hi:X}}}'),\n") + module.write(" ];\n") + module.write("}\n") @@ -569,6 +631,7 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) + emoji_variations = variation_sequences() print("------------------------") total_size = 0 @@ -579,7 +642,7 @@ def main(module_filename: str): print("------------------------") print(f" Total Size: {total_size} bytes") - emit_module(module_filename, version, tables) + emit_module(module_filename, version, tables, emoji_variations) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index 2f22613..aec3b74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -108,6 +108,11 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 1 column wide. This is consistent with the recommendations for /// non-CJK contexts, or when the context cannot be reliably determined. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. fn width(&self) -> usize; /// Returns the string's displayed width in columns. @@ -118,17 +123,39 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 2 column wide. This is consistent with the recommendations for /// CJK contexts. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. 
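As a concrete illustration of the caveat documented above: a string's width stops being the sum of its characters' widths once a variation selector upgrades its base character. A minimal usage sketch, mirroring the values asserted by the tests added later in this patch:

    use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

    fn main() {
        // Mirrors the tests added in src/tests.rs by this patch.
        assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); // '#' on its own
        assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); // variation selector-16
        assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); // the two together
    }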
fn width_cjk(&self) -> usize; } impl UnicodeWidthStr for str { #[inline] fn width(&self) -> usize { - self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum() + str_width(self, false) } #[inline] fn width_cjk(&self) -> usize { - self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum() + str_width(self, true) } } + +fn str_width(s: &str, is_cjk: bool) -> usize { + s.chars() + .rfold((0, false), |(sum, was_fe0f), c| { + if c == '\u{FE0F}' { + (sum, true) + } else { + let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) { + 2 + } else { + cw::width(c, is_cjk).unwrap_or(0) + }; + (sum + add, false) + } + }) + .0 +} diff --git a/src/tables.rs b/src/tables.rs index 8e2e9eb..4e1064d 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -57,6 +57,27 @@ pub mod charwidth { } } + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\u{FEOF}'`. + /// Emoji presentation sequences are considered to have width 2. + #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + + EMOJI_PRESENTATION_RANGES + .binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() + } + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or /// `None` if `c` is a control character other than `'\x00'`. /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, @@ -538,4 +559,192 @@ pub mod charwidth { 0x55, 0xAA, 0xAA, 0x56, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; + + /// Each tuple corresponds to a range (inclusive at both ends) + /// of characters that can start an emoji presentation sequence. 
+ static EMOJI_PRESENTATION_RANGES: [(char, char); 183] = [ + ('\u{23}', '\u{23}'), + ('\u{2A}', '\u{2A}'), + ('\u{30}', '\u{39}'), + ('\u{A9}', '\u{A9}'), + ('\u{AE}', '\u{AE}'), + ('\u{203C}', '\u{203C}'), + ('\u{2049}', '\u{2049}'), + ('\u{2122}', '\u{2122}'), + ('\u{2139}', '\u{2139}'), + ('\u{2194}', '\u{2199}'), + ('\u{21A9}', '\u{21AA}'), + ('\u{231A}', '\u{231B}'), + ('\u{2328}', '\u{2328}'), + ('\u{23CF}', '\u{23CF}'), + ('\u{23E9}', '\u{23F3}'), + ('\u{23F8}', '\u{23FA}'), + ('\u{24C2}', '\u{24C2}'), + ('\u{25AA}', '\u{25AB}'), + ('\u{25B6}', '\u{25B6}'), + ('\u{25C0}', '\u{25C0}'), + ('\u{25FB}', '\u{25FE}'), + ('\u{2600}', '\u{2604}'), + ('\u{260E}', '\u{260E}'), + ('\u{2611}', '\u{2611}'), + ('\u{2614}', '\u{2615}'), + ('\u{2618}', '\u{2618}'), + ('\u{261D}', '\u{261D}'), + ('\u{2620}', '\u{2620}'), + ('\u{2622}', '\u{2623}'), + ('\u{2626}', '\u{2626}'), + ('\u{262A}', '\u{262A}'), + ('\u{262E}', '\u{262F}'), + ('\u{2638}', '\u{263A}'), + ('\u{2640}', '\u{2640}'), + ('\u{2642}', '\u{2642}'), + ('\u{2648}', '\u{2653}'), + ('\u{265F}', '\u{2660}'), + ('\u{2663}', '\u{2663}'), + ('\u{2665}', '\u{2666}'), + ('\u{2668}', '\u{2668}'), + ('\u{267B}', '\u{267B}'), + ('\u{267E}', '\u{267F}'), + ('\u{2692}', '\u{2697}'), + ('\u{2699}', '\u{2699}'), + ('\u{269B}', '\u{269C}'), + ('\u{26A0}', '\u{26A1}'), + ('\u{26A7}', '\u{26A7}'), + ('\u{26AA}', '\u{26AB}'), + ('\u{26B0}', '\u{26B1}'), + ('\u{26BD}', '\u{26BE}'), + ('\u{26C4}', '\u{26C5}'), + ('\u{26C8}', '\u{26C8}'), + ('\u{26CE}', '\u{26CF}'), + ('\u{26D1}', '\u{26D1}'), + ('\u{26D3}', '\u{26D4}'), + ('\u{26E9}', '\u{26EA}'), + ('\u{26F0}', '\u{26F5}'), + ('\u{26F7}', '\u{26FA}'), + ('\u{26FD}', '\u{26FD}'), + ('\u{2702}', '\u{2702}'), + ('\u{2705}', '\u{2705}'), + ('\u{2708}', '\u{270D}'), + ('\u{270F}', '\u{270F}'), + ('\u{2712}', '\u{2712}'), + ('\u{2714}', '\u{2714}'), + ('\u{2716}', '\u{2716}'), + ('\u{271D}', '\u{271D}'), + ('\u{2721}', '\u{2721}'), + ('\u{2728}', '\u{2728}'), + ('\u{2733}', '\u{2734}'), + ('\u{2744}', '\u{2744}'), + ('\u{2747}', '\u{2747}'), + ('\u{274C}', '\u{274C}'), + ('\u{274E}', '\u{274E}'), + ('\u{2753}', '\u{2755}'), + ('\u{2757}', '\u{2757}'), + ('\u{2763}', '\u{2764}'), + ('\u{2795}', '\u{2797}'), + ('\u{27A1}', '\u{27A1}'), + ('\u{27B0}', '\u{27B0}'), + ('\u{27BF}', '\u{27BF}'), + ('\u{2934}', '\u{2935}'), + ('\u{2B05}', '\u{2B07}'), + ('\u{2B1B}', '\u{2B1C}'), + ('\u{2B50}', '\u{2B50}'), + ('\u{2B55}', '\u{2B55}'), + ('\u{3030}', '\u{3030}'), + ('\u{303D}', '\u{303D}'), + ('\u{3297}', '\u{3297}'), + ('\u{3299}', '\u{3299}'), + ('\u{1F004}', '\u{1F004}'), + ('\u{1F170}', '\u{1F171}'), + ('\u{1F17E}', '\u{1F17F}'), + ('\u{1F202}', '\u{1F202}'), + ('\u{1F21A}', '\u{1F21A}'), + ('\u{1F22F}', '\u{1F22F}'), + ('\u{1F237}', '\u{1F237}'), + ('\u{1F30D}', '\u{1F30F}'), + ('\u{1F315}', '\u{1F315}'), + ('\u{1F31C}', '\u{1F31C}'), + ('\u{1F321}', '\u{1F321}'), + ('\u{1F324}', '\u{1F32C}'), + ('\u{1F336}', '\u{1F336}'), + ('\u{1F378}', '\u{1F378}'), + ('\u{1F37D}', '\u{1F37D}'), + ('\u{1F393}', '\u{1F393}'), + ('\u{1F396}', '\u{1F397}'), + ('\u{1F399}', '\u{1F39B}'), + ('\u{1F39E}', '\u{1F39F}'), + ('\u{1F3A7}', '\u{1F3A7}'), + ('\u{1F3AC}', '\u{1F3AE}'), + ('\u{1F3C2}', '\u{1F3C2}'), + ('\u{1F3C4}', '\u{1F3C4}'), + ('\u{1F3C6}', '\u{1F3C6}'), + ('\u{1F3CA}', '\u{1F3CE}'), + ('\u{1F3D4}', '\u{1F3E0}'), + ('\u{1F3ED}', '\u{1F3ED}'), + ('\u{1F3F3}', '\u{1F3F3}'), + ('\u{1F3F5}', '\u{1F3F5}'), + ('\u{1F3F7}', '\u{1F3F7}'), + ('\u{1F408}', '\u{1F408}'), + ('\u{1F415}', '\u{1F415}'), + ('\u{1F41F}', '\u{1F41F}'), + 
('\u{1F426}', '\u{1F426}'), + ('\u{1F43F}', '\u{1F43F}'), + ('\u{1F441}', '\u{1F442}'), + ('\u{1F446}', '\u{1F449}'), + ('\u{1F44D}', '\u{1F44E}'), + ('\u{1F453}', '\u{1F453}'), + ('\u{1F46A}', '\u{1F46A}'), + ('\u{1F47D}', '\u{1F47D}'), + ('\u{1F4A3}', '\u{1F4A3}'), + ('\u{1F4B0}', '\u{1F4B0}'), + ('\u{1F4B3}', '\u{1F4B3}'), + ('\u{1F4BB}', '\u{1F4BB}'), + ('\u{1F4BF}', '\u{1F4BF}'), + ('\u{1F4CB}', '\u{1F4CB}'), + ('\u{1F4DA}', '\u{1F4DA}'), + ('\u{1F4DF}', '\u{1F4DF}'), + ('\u{1F4E4}', '\u{1F4E6}'), + ('\u{1F4EA}', '\u{1F4ED}'), + ('\u{1F4F7}', '\u{1F4F7}'), + ('\u{1F4F9}', '\u{1F4FB}'), + ('\u{1F4FD}', '\u{1F4FD}'), + ('\u{1F508}', '\u{1F508}'), + ('\u{1F50D}', '\u{1F50D}'), + ('\u{1F512}', '\u{1F513}'), + ('\u{1F549}', '\u{1F54A}'), + ('\u{1F550}', '\u{1F567}'), + ('\u{1F56F}', '\u{1F570}'), + ('\u{1F573}', '\u{1F579}'), + ('\u{1F587}', '\u{1F587}'), + ('\u{1F58A}', '\u{1F58D}'), + ('\u{1F590}', '\u{1F590}'), + ('\u{1F5A5}', '\u{1F5A5}'), + ('\u{1F5A8}', '\u{1F5A8}'), + ('\u{1F5B1}', '\u{1F5B2}'), + ('\u{1F5BC}', '\u{1F5BC}'), + ('\u{1F5C2}', '\u{1F5C4}'), + ('\u{1F5D1}', '\u{1F5D3}'), + ('\u{1F5DC}', '\u{1F5DE}'), + ('\u{1F5E1}', '\u{1F5E1}'), + ('\u{1F5E3}', '\u{1F5E3}'), + ('\u{1F5E8}', '\u{1F5E8}'), + ('\u{1F5EF}', '\u{1F5EF}'), + ('\u{1F5F3}', '\u{1F5F3}'), + ('\u{1F5FA}', '\u{1F5FA}'), + ('\u{1F610}', '\u{1F610}'), + ('\u{1F687}', '\u{1F687}'), + ('\u{1F68D}', '\u{1F68D}'), + ('\u{1F691}', '\u{1F691}'), + ('\u{1F694}', '\u{1F694}'), + ('\u{1F698}', '\u{1F698}'), + ('\u{1F6AD}', '\u{1F6AD}'), + ('\u{1F6B2}', '\u{1F6B2}'), + ('\u{1F6B9}', '\u{1F6BA}'), + ('\u{1F6BC}', '\u{1F6BC}'), + ('\u{1F6CB}', '\u{1F6CB}'), + ('\u{1F6CD}', '\u{1F6CF}'), + ('\u{1F6E0}', '\u{1F6E5}'), + ('\u{1F6E9}', '\u{1F6E9}'), + ('\u{1F6F0}', '\u{1F6F0}'), + ('\u{1F6F3}', '\u{1F6F3}'), + ]; } diff --git a/src/tests.rs b/src/tests.rs index 9e3805b..e8f6686 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -272,3 +272,17 @@ fn test_canonical_equivalence() { //assert_eq!(c.width_cjk().unwrap_or(0), nfd.width_cjk(), "{c}, {nfd}"); } } + +#[test] +fn test_emoji_presentation() { + use super::{UnicodeWidthChar, UnicodeWidthStr}; + #[cfg(feature = "no_std")] + use core::option::Option::Some; + + assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); + assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); + assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); +} From 6bd8215852ba0110950175aaf074892b381f756b Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 13 Feb 2024 20:49:17 -0500 Subject: [PATCH 02/13] emoji presentation: store single codepoints instead of ranges --- scripts/unicode.py | 42 ++--- src/tables.rs | 412 +++++++++++++++++++++++---------------------- 2 files changed, 230 insertions(+), 224 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index fe0236c..25bbd9e 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -410,29 +410,29 @@ def make_tables( return tables -def variation_sequences() -> "list[tuple[int, int]]": +def load_variation_sequences(width_map) -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting - an emoji presentation sequence.""" + an emoji presentation sequence, exclusing those that are always wide.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: sequence = 
re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") - ranges = [] + codepoints = [] for line in sequences.readlines(): if match := sequence.match(line): cp = int(match.group(1), 16) - if ranges != [] and ranges[-1][1] == cp - 1: - ranges[-1] = (ranges[-1][0], cp) - else: - ranges.append((cp, cp)) - - return ranges + if width_map[cp] == EffectiveWidth.WIDE: + # this character would be width 2 even outside a variation sequence, + # so we don't need to store its info + continue + codepoints.append(cp) + return codepoints def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - emoji_variations: "list[tuple[int, int]]", + emoji_variations: "list[int]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -517,19 +517,7 @@ def emit_module( /// Emoji presentation sequences are considered to have width 2. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - use core::cmp::Ordering::{Equal, Greater, Less}; - - EMOJI_PRESENTATION_RANGES - .binary_search_by(|&(lo, hi)| { - if lo > c { - Greater - } else if hi < c { - Less - } else { - Equal - } - }) - .is_ok() + EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() } """ ) @@ -589,11 +577,11 @@ def emit_module( f""" /// Each tuple corresponds to a range (inclusive at both ends) /// of characters that can start an emoji presentation sequence. - static EMOJI_PRESENTATION_RANGES: [(char, char); {len(emoji_variations)}] = [ + static EMOJI_PRESENTATION_RANGES: [char; {len(emoji_variations)}] = [ """ ) - for lo, hi in emoji_variations: - module.write(f" ('\\u{{{lo:X}}}', '\\u{{{hi:X}}}'),\n") + for cp in emoji_variations: + module.write(f" '\\u{{{cp:X}}}',\n") module.write(" ];\n") module.write("}\n") @@ -631,7 +619,7 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = variation_sequences() + emoji_variations = load_variation_sequences(width_map) print("------------------------") total_size = 0 diff --git a/src/tables.rs b/src/tables.rs index 4e1064d..1f92bdb 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -63,19 +63,7 @@ pub mod charwidth { /// Emoji presentation sequences are considered to have width 2. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - use core::cmp::Ordering::{Equal, Greater, Less}; - - EMOJI_PRESENTATION_RANGES - .binary_search_by(|&(lo, hi)| { - if lo > c { - Greater - } else if hi < c { - Less - } else { - Equal - } - }) - .is_ok() + EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() } /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -562,189 +550,219 @@ pub mod charwidth { /// Each tuple corresponds to a range (inclusive at both ends) /// of characters that can start an emoji presentation sequence. 
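A short sketch of why `load_variation_sequences` above can drop codepoints that are already `EffectiveWidth.WIDE`: a wide base character occupies 2 columns with or without a trailing U+FE0F, so leaving it out of the lookup table cannot change any result. U+231A WATCH (East_Asian_Width=Wide, covered by the old range table but absent from the new single-codepoint list) serves as the example here:

    use unicode_width::UnicodeWidthStr;

    fn main() {
        // U+231A WATCH is East_Asian_Width=Wide, so it is 2 columns on its own...
        assert_eq!(UnicodeWidthStr::width("\u{231A}"), 2);
        // ...and a trailing U+FE0F (width 0) cannot change that, so the table
        // does not need to record it.
        assert_eq!(UnicodeWidthStr::width("\u{231A}\u{FE0F}"), 2);
    }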
- static EMOJI_PRESENTATION_RANGES: [(char, char); 183] = [ - ('\u{23}', '\u{23}'), - ('\u{2A}', '\u{2A}'), - ('\u{30}', '\u{39}'), - ('\u{A9}', '\u{A9}'), - ('\u{AE}', '\u{AE}'), - ('\u{203C}', '\u{203C}'), - ('\u{2049}', '\u{2049}'), - ('\u{2122}', '\u{2122}'), - ('\u{2139}', '\u{2139}'), - ('\u{2194}', '\u{2199}'), - ('\u{21A9}', '\u{21AA}'), - ('\u{231A}', '\u{231B}'), - ('\u{2328}', '\u{2328}'), - ('\u{23CF}', '\u{23CF}'), - ('\u{23E9}', '\u{23F3}'), - ('\u{23F8}', '\u{23FA}'), - ('\u{24C2}', '\u{24C2}'), - ('\u{25AA}', '\u{25AB}'), - ('\u{25B6}', '\u{25B6}'), - ('\u{25C0}', '\u{25C0}'), - ('\u{25FB}', '\u{25FE}'), - ('\u{2600}', '\u{2604}'), - ('\u{260E}', '\u{260E}'), - ('\u{2611}', '\u{2611}'), - ('\u{2614}', '\u{2615}'), - ('\u{2618}', '\u{2618}'), - ('\u{261D}', '\u{261D}'), - ('\u{2620}', '\u{2620}'), - ('\u{2622}', '\u{2623}'), - ('\u{2626}', '\u{2626}'), - ('\u{262A}', '\u{262A}'), - ('\u{262E}', '\u{262F}'), - ('\u{2638}', '\u{263A}'), - ('\u{2640}', '\u{2640}'), - ('\u{2642}', '\u{2642}'), - ('\u{2648}', '\u{2653}'), - ('\u{265F}', '\u{2660}'), - ('\u{2663}', '\u{2663}'), - ('\u{2665}', '\u{2666}'), - ('\u{2668}', '\u{2668}'), - ('\u{267B}', '\u{267B}'), - ('\u{267E}', '\u{267F}'), - ('\u{2692}', '\u{2697}'), - ('\u{2699}', '\u{2699}'), - ('\u{269B}', '\u{269C}'), - ('\u{26A0}', '\u{26A1}'), - ('\u{26A7}', '\u{26A7}'), - ('\u{26AA}', '\u{26AB}'), - ('\u{26B0}', '\u{26B1}'), - ('\u{26BD}', '\u{26BE}'), - ('\u{26C4}', '\u{26C5}'), - ('\u{26C8}', '\u{26C8}'), - ('\u{26CE}', '\u{26CF}'), - ('\u{26D1}', '\u{26D1}'), - ('\u{26D3}', '\u{26D4}'), - ('\u{26E9}', '\u{26EA}'), - ('\u{26F0}', '\u{26F5}'), - ('\u{26F7}', '\u{26FA}'), - ('\u{26FD}', '\u{26FD}'), - ('\u{2702}', '\u{2702}'), - ('\u{2705}', '\u{2705}'), - ('\u{2708}', '\u{270D}'), - ('\u{270F}', '\u{270F}'), - ('\u{2712}', '\u{2712}'), - ('\u{2714}', '\u{2714}'), - ('\u{2716}', '\u{2716}'), - ('\u{271D}', '\u{271D}'), - ('\u{2721}', '\u{2721}'), - ('\u{2728}', '\u{2728}'), - ('\u{2733}', '\u{2734}'), - ('\u{2744}', '\u{2744}'), - ('\u{2747}', '\u{2747}'), - ('\u{274C}', '\u{274C}'), - ('\u{274E}', '\u{274E}'), - ('\u{2753}', '\u{2755}'), - ('\u{2757}', '\u{2757}'), - ('\u{2763}', '\u{2764}'), - ('\u{2795}', '\u{2797}'), - ('\u{27A1}', '\u{27A1}'), - ('\u{27B0}', '\u{27B0}'), - ('\u{27BF}', '\u{27BF}'), - ('\u{2934}', '\u{2935}'), - ('\u{2B05}', '\u{2B07}'), - ('\u{2B1B}', '\u{2B1C}'), - ('\u{2B50}', '\u{2B50}'), - ('\u{2B55}', '\u{2B55}'), - ('\u{3030}', '\u{3030}'), - ('\u{303D}', '\u{303D}'), - ('\u{3297}', '\u{3297}'), - ('\u{3299}', '\u{3299}'), - ('\u{1F004}', '\u{1F004}'), - ('\u{1F170}', '\u{1F171}'), - ('\u{1F17E}', '\u{1F17F}'), - ('\u{1F202}', '\u{1F202}'), - ('\u{1F21A}', '\u{1F21A}'), - ('\u{1F22F}', '\u{1F22F}'), - ('\u{1F237}', '\u{1F237}'), - ('\u{1F30D}', '\u{1F30F}'), - ('\u{1F315}', '\u{1F315}'), - ('\u{1F31C}', '\u{1F31C}'), - ('\u{1F321}', '\u{1F321}'), - ('\u{1F324}', '\u{1F32C}'), - ('\u{1F336}', '\u{1F336}'), - ('\u{1F378}', '\u{1F378}'), - ('\u{1F37D}', '\u{1F37D}'), - ('\u{1F393}', '\u{1F393}'), - ('\u{1F396}', '\u{1F397}'), - ('\u{1F399}', '\u{1F39B}'), - ('\u{1F39E}', '\u{1F39F}'), - ('\u{1F3A7}', '\u{1F3A7}'), - ('\u{1F3AC}', '\u{1F3AE}'), - ('\u{1F3C2}', '\u{1F3C2}'), - ('\u{1F3C4}', '\u{1F3C4}'), - ('\u{1F3C6}', '\u{1F3C6}'), - ('\u{1F3CA}', '\u{1F3CE}'), - ('\u{1F3D4}', '\u{1F3E0}'), - ('\u{1F3ED}', '\u{1F3ED}'), - ('\u{1F3F3}', '\u{1F3F3}'), - ('\u{1F3F5}', '\u{1F3F5}'), - ('\u{1F3F7}', '\u{1F3F7}'), - ('\u{1F408}', '\u{1F408}'), - ('\u{1F415}', '\u{1F415}'), - ('\u{1F41F}', '\u{1F41F}'), - 
('\u{1F426}', '\u{1F426}'), - ('\u{1F43F}', '\u{1F43F}'), - ('\u{1F441}', '\u{1F442}'), - ('\u{1F446}', '\u{1F449}'), - ('\u{1F44D}', '\u{1F44E}'), - ('\u{1F453}', '\u{1F453}'), - ('\u{1F46A}', '\u{1F46A}'), - ('\u{1F47D}', '\u{1F47D}'), - ('\u{1F4A3}', '\u{1F4A3}'), - ('\u{1F4B0}', '\u{1F4B0}'), - ('\u{1F4B3}', '\u{1F4B3}'), - ('\u{1F4BB}', '\u{1F4BB}'), - ('\u{1F4BF}', '\u{1F4BF}'), - ('\u{1F4CB}', '\u{1F4CB}'), - ('\u{1F4DA}', '\u{1F4DA}'), - ('\u{1F4DF}', '\u{1F4DF}'), - ('\u{1F4E4}', '\u{1F4E6}'), - ('\u{1F4EA}', '\u{1F4ED}'), - ('\u{1F4F7}', '\u{1F4F7}'), - ('\u{1F4F9}', '\u{1F4FB}'), - ('\u{1F4FD}', '\u{1F4FD}'), - ('\u{1F508}', '\u{1F508}'), - ('\u{1F50D}', '\u{1F50D}'), - ('\u{1F512}', '\u{1F513}'), - ('\u{1F549}', '\u{1F54A}'), - ('\u{1F550}', '\u{1F567}'), - ('\u{1F56F}', '\u{1F570}'), - ('\u{1F573}', '\u{1F579}'), - ('\u{1F587}', '\u{1F587}'), - ('\u{1F58A}', '\u{1F58D}'), - ('\u{1F590}', '\u{1F590}'), - ('\u{1F5A5}', '\u{1F5A5}'), - ('\u{1F5A8}', '\u{1F5A8}'), - ('\u{1F5B1}', '\u{1F5B2}'), - ('\u{1F5BC}', '\u{1F5BC}'), - ('\u{1F5C2}', '\u{1F5C4}'), - ('\u{1F5D1}', '\u{1F5D3}'), - ('\u{1F5DC}', '\u{1F5DE}'), - ('\u{1F5E1}', '\u{1F5E1}'), - ('\u{1F5E3}', '\u{1F5E3}'), - ('\u{1F5E8}', '\u{1F5E8}'), - ('\u{1F5EF}', '\u{1F5EF}'), - ('\u{1F5F3}', '\u{1F5F3}'), - ('\u{1F5FA}', '\u{1F5FA}'), - ('\u{1F610}', '\u{1F610}'), - ('\u{1F687}', '\u{1F687}'), - ('\u{1F68D}', '\u{1F68D}'), - ('\u{1F691}', '\u{1F691}'), - ('\u{1F694}', '\u{1F694}'), - ('\u{1F698}', '\u{1F698}'), - ('\u{1F6AD}', '\u{1F6AD}'), - ('\u{1F6B2}', '\u{1F6B2}'), - ('\u{1F6B9}', '\u{1F6BA}'), - ('\u{1F6BC}', '\u{1F6BC}'), - ('\u{1F6CB}', '\u{1F6CB}'), - ('\u{1F6CD}', '\u{1F6CF}'), - ('\u{1F6E0}', '\u{1F6E5}'), - ('\u{1F6E9}', '\u{1F6E9}'), - ('\u{1F6F0}', '\u{1F6F0}'), - ('\u{1F6F3}', '\u{1F6F3}'), + static EMOJI_PRESENTATION_RANGES: [char; 213] = [ + '\u{23}', + '\u{2A}', + '\u{30}', + '\u{31}', + '\u{32}', + '\u{33}', + '\u{34}', + '\u{35}', + '\u{36}', + '\u{37}', + '\u{38}', + '\u{39}', + '\u{A9}', + '\u{AE}', + '\u{203C}', + '\u{2049}', + '\u{2122}', + '\u{2139}', + '\u{2194}', + '\u{2195}', + '\u{2196}', + '\u{2197}', + '\u{2198}', + '\u{2199}', + '\u{21A9}', + '\u{21AA}', + '\u{2328}', + '\u{23CF}', + '\u{23ED}', + '\u{23EE}', + '\u{23EF}', + '\u{23F1}', + '\u{23F2}', + '\u{23F8}', + '\u{23F9}', + '\u{23FA}', + '\u{24C2}', + '\u{25AA}', + '\u{25AB}', + '\u{25B6}', + '\u{25C0}', + '\u{25FB}', + '\u{25FC}', + '\u{2600}', + '\u{2601}', + '\u{2602}', + '\u{2603}', + '\u{2604}', + '\u{260E}', + '\u{2611}', + '\u{2618}', + '\u{261D}', + '\u{2620}', + '\u{2622}', + '\u{2623}', + '\u{2626}', + '\u{262A}', + '\u{262E}', + '\u{262F}', + '\u{2638}', + '\u{2639}', + '\u{263A}', + '\u{2640}', + '\u{2642}', + '\u{265F}', + '\u{2660}', + '\u{2663}', + '\u{2665}', + '\u{2666}', + '\u{2668}', + '\u{267B}', + '\u{267E}', + '\u{2692}', + '\u{2694}', + '\u{2695}', + '\u{2696}', + '\u{2697}', + '\u{2699}', + '\u{269B}', + '\u{269C}', + '\u{26A0}', + '\u{26A7}', + '\u{26B0}', + '\u{26B1}', + '\u{26C8}', + '\u{26CF}', + '\u{26D1}', + '\u{26D3}', + '\u{26E9}', + '\u{26F0}', + '\u{26F1}', + '\u{26F4}', + '\u{26F7}', + '\u{26F8}', + '\u{26F9}', + '\u{2702}', + '\u{2708}', + '\u{2709}', + '\u{270C}', + '\u{270D}', + '\u{270F}', + '\u{2712}', + '\u{2714}', + '\u{2716}', + '\u{271D}', + '\u{2721}', + '\u{2733}', + '\u{2734}', + '\u{2744}', + '\u{2747}', + '\u{2763}', + '\u{2764}', + '\u{27A1}', + '\u{2934}', + '\u{2935}', + '\u{2B05}', + '\u{2B06}', + '\u{2B07}', + '\u{1F170}', + '\u{1F171}', + '\u{1F17E}', + '\u{1F17F}', + '\u{1F321}', + 
'\u{1F324}', + '\u{1F325}', + '\u{1F326}', + '\u{1F327}', + '\u{1F328}', + '\u{1F329}', + '\u{1F32A}', + '\u{1F32B}', + '\u{1F32C}', + '\u{1F336}', + '\u{1F37D}', + '\u{1F396}', + '\u{1F397}', + '\u{1F399}', + '\u{1F39A}', + '\u{1F39B}', + '\u{1F39E}', + '\u{1F39F}', + '\u{1F3CB}', + '\u{1F3CC}', + '\u{1F3CD}', + '\u{1F3CE}', + '\u{1F3D4}', + '\u{1F3D5}', + '\u{1F3D6}', + '\u{1F3D7}', + '\u{1F3D8}', + '\u{1F3D9}', + '\u{1F3DA}', + '\u{1F3DB}', + '\u{1F3DC}', + '\u{1F3DD}', + '\u{1F3DE}', + '\u{1F3DF}', + '\u{1F3F3}', + '\u{1F3F5}', + '\u{1F3F7}', + '\u{1F43F}', + '\u{1F441}', + '\u{1F4FD}', + '\u{1F549}', + '\u{1F54A}', + '\u{1F56F}', + '\u{1F570}', + '\u{1F573}', + '\u{1F574}', + '\u{1F575}', + '\u{1F576}', + '\u{1F577}', + '\u{1F578}', + '\u{1F579}', + '\u{1F587}', + '\u{1F58A}', + '\u{1F58B}', + '\u{1F58C}', + '\u{1F58D}', + '\u{1F590}', + '\u{1F5A5}', + '\u{1F5A8}', + '\u{1F5B1}', + '\u{1F5B2}', + '\u{1F5BC}', + '\u{1F5C2}', + '\u{1F5C3}', + '\u{1F5C4}', + '\u{1F5D1}', + '\u{1F5D2}', + '\u{1F5D3}', + '\u{1F5DC}', + '\u{1F5DD}', + '\u{1F5DE}', + '\u{1F5E1}', + '\u{1F5E3}', + '\u{1F5E8}', + '\u{1F5EF}', + '\u{1F5F3}', + '\u{1F5FA}', + '\u{1F6CB}', + '\u{1F6CD}', + '\u{1F6CE}', + '\u{1F6CF}', + '\u{1F6E0}', + '\u{1F6E1}', + '\u{1F6E2}', + '\u{1F6E3}', + '\u{1F6E4}', + '\u{1F6E5}', + '\u{1F6E9}', + '\u{1F6F0}', + '\u{1F6F3}', ]; } From a4d25a96e5a620c381cc64a7c4043874d4c8175f Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 10:23:51 -0500 Subject: [PATCH 03/13] Use a better datastructure Faster and smaller! --- .github/workflows/rust.yml | 3 + Cargo.toml | 9 +- scripts/unicode.py | 138 +++++++++++++--- src/lib.rs | 3 +- src/tables.rs | 328 +++++++++++++------------------------ src/tests.rs | 7 + 6 files changed, 247 insertions(+), 241 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c5f57..7731d4c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,6 +22,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Regen run: cd scripts && python3 unicode.py - name: Diff diff --git a/Cargo.toml b/Cargo.toml index bd8da9c..7c44aa6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,20 +2,23 @@ name = "unicode-width" version = "0.1.11" -authors = ["kwantam ", "Manish Goregaokar "] - +authors = [ + "kwantam ", + "Manish Goregaokar ", +] homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" documentation = "https://unicode-rs.github.io/unicode-width" license = "MIT/Apache-2.0" keywords = ["text", "width", "unicode"] readme = "README.md" +edition = "2021" description = """ Determine displayed width of `char` and `str` types according to Unicode Standard Annex #11 rules. 
""" -exclude = [ "target/*", "Cargo.lock" ] +exclude = ["target/*", "Cargo.lock"] [dependencies] std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } diff --git a/scripts/unicode.py b/scripts/unicode.py index 25bbd9e..4195c65 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -23,6 +23,8 @@ import os import re import sys +from collections import defaultdict +from itertools import batched NUM_CODEPOINTS = 0x110000 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" @@ -410,9 +412,9 @@ def make_tables( return tables -def load_variation_sequences(width_map) -> "list[int]": +def load_variation_sequences() -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting - an emoji presentation sequence, exclusing those that are always wide.""" + an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") @@ -420,19 +422,68 @@ def load_variation_sequences(width_map) -> "list[int]": for line in sequences.readlines(): if match := sequence.match(line): cp = int(match.group(1), 16) - if width_map[cp] == EffectiveWidth.WIDE: - # this character would be width 2 even outside a variation sequence, - # so we don't need to store its info - continue codepoints.append(cp) return codepoints +def make_variation_sequence_table( + seqs: "list[int]", + width_map, +) -> "tuple[list[int], list[list[int]]]": + """Generates 2-level look up table for whether a codepoint might start an emoji presentation sequence. + (Characters that are always wide may be excluded.) + First level maps the most significant byte to a 4-bit index (or 0xFF if can't possibly start such a sequence), + second level is a bit array (each leaf is 512 bits long).""" + # The structure of the table currently relies on this. + # It's unlikely to be a problem in the near future + # as this is enough to encompass the entire Basic Multilingual Plane and + # Supplementary Multilingual Plane. + # And the fix is easy if it ever does become a problem: + # just check bits 1 more significant for the index, + # and use 1024-bit leaves instead of 512-bit. + assert seqs[-1] <= 0x1FFFF + + prefixes_dict = defaultdict(list) + for cp in seqs: + prefixes_dict[cp >> 9].append(cp & 0x1FF) + + # We don't strictly need to keep track of characters that are always wide, + # because being in an emoji variation seq won't affect their width. + # So store their info only when it wouldn't inflate the size of the tables. + keys = list(prefixes_dict.keys()) + for k in keys: + if all(map(lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, prefixes_dict[k])): + del prefixes_dict[k] + + # Another assumption made by the data structure. 
+ # Ensures 4 bits are enough to index into subtable + assert len(prefixes_dict.keys()) <= 15 + index_nibbles = [0xF] * 256 + for idx, k in enumerate(prefixes_dict.keys()): + index_nibbles[k] = idx + + index = [] + for tup in batched(index_nibbles, 2): + next = 0 + for i in range(0, 2): + next |= tup[i] << (4 * i) + index.append(next) + + leaves = [] + for leaf_idx, cps in enumerate(prefixes_dict.values()): + leaf = [0] * 64 + for cp in cps: + idx_in_leaf, bit_shift = divmod(cp, 8) + leaf[idx_in_leaf] |= 1 << bit_shift + leaves.append(leaf) + return (index, leaves) + + def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - emoji_variations: "list[int]", + variation_table: "tuple[list[int], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -509,16 +560,33 @@ def emit_module( """ ) + variation_idx, variation_leaves = variation_table + module.write( - """ + f""" /// Whether this character forms an [emoji presentation sequence] /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) - /// when followed by `'\\u{FEOF}'`. + /// when followed by `'\\u{{FEOF}}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] - pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() - } + pub fn starts_emoji_presentation_seq(c: char) -> bool {{ + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else {{ + return false; + }}; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= {len(variation_leaves)} {{ + return false; + }} + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 + }} """ ) @@ -575,15 +643,36 @@ def emit_module( module.write( f""" - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. - static EMOJI_PRESENTATION_RANGES: [char; {len(emoji_variations)}] = [ + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. + static EMOJI_PRESENTATION_INDEX: [u8; {len(variation_idx)}] = [ """ ) - for cp in emoji_variations: - module.write(f" '\\u{{{cp:X}}}',\n") + for row in batched(variation_idx, 15): + module.write(" ") + for idx in row: + module.write(f" 0x{idx:02X},") + module.write("\n") module.write(" ];\n") + module.write( + f""" + /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; {len(variation_leaves)}] = [ +""" + ) + for leaf in variation_leaves: + module.write(" [\n") + for row in batched(leaf, 14): + module.write(" ") + for entry in row: + module.write(f" 0x{entry:02X},") + module.write("\n") + module.write(" ],\n") + + module.write(" ];\n") module.write("}\n") @@ -593,6 +682,7 @@ def main(module_filename: str): `module_filename`. 
We obey the following rules in decreasing order of importance: + - Emoji presentation sequences are double-width. - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c) - Hangul jamo medial vowels & final consonants are zero-width. - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. @@ -619,18 +709,26 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = load_variation_sequences(width_map) + + emoji_variations = load_variation_sequences() + variation_table = make_variation_sequence_table(emoji_variations, width_map) print("------------------------") total_size = 0 for i, table in enumerate(tables): size_bytes = len(table.to_bytes()) - print(f"Table {i} Size: {size_bytes} bytes") + print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes + emoji_index_size = len(variation_table[0]) + print(f"Emoji Presentation Index Size: {emoji_index_size} bytes") + total_size += emoji_index_size + emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) + print(f"Emoji Presentation Leaves Size: {emoji_leaves_size} bytes") + total_size += emoji_leaves_size print("------------------------") print(f" Total Size: {total_size} bytes") - emit_module(module_filename, version, tables, emoji_variations) + emit_module(module_filename, version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index aec3b74..45d97e0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,8 @@ //! unicode-width = "0.1.5" //! ``` -#![deny(missing_docs, unsafe_code)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" diff --git a/src/tables.rs b/src/tables.rs index 1f92bdb..26da4be 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -61,9 +61,24 @@ pub mod charwidth { /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// when followed by `'\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else { + return false; + }; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= 11 { + return false; + } + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 } /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -548,221 +563,100 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. 
- static EMOJI_PRESENTATION_RANGES: [char; 213] = [ - '\u{23}', - '\u{2A}', - '\u{30}', - '\u{31}', - '\u{32}', - '\u{33}', - '\u{34}', - '\u{35}', - '\u{36}', - '\u{37}', - '\u{38}', - '\u{39}', - '\u{A9}', - '\u{AE}', - '\u{203C}', - '\u{2049}', - '\u{2122}', - '\u{2139}', - '\u{2194}', - '\u{2195}', - '\u{2196}', - '\u{2197}', - '\u{2198}', - '\u{2199}', - '\u{21A9}', - '\u{21AA}', - '\u{2328}', - '\u{23CF}', - '\u{23ED}', - '\u{23EE}', - '\u{23EF}', - '\u{23F1}', - '\u{23F2}', - '\u{23F8}', - '\u{23F9}', - '\u{23FA}', - '\u{24C2}', - '\u{25AA}', - '\u{25AB}', - '\u{25B6}', - '\u{25C0}', - '\u{25FB}', - '\u{25FC}', - '\u{2600}', - '\u{2601}', - '\u{2602}', - '\u{2603}', - '\u{2604}', - '\u{260E}', - '\u{2611}', - '\u{2618}', - '\u{261D}', - '\u{2620}', - '\u{2622}', - '\u{2623}', - '\u{2626}', - '\u{262A}', - '\u{262E}', - '\u{262F}', - '\u{2638}', - '\u{2639}', - '\u{263A}', - '\u{2640}', - '\u{2642}', - '\u{265F}', - '\u{2660}', - '\u{2663}', - '\u{2665}', - '\u{2666}', - '\u{2668}', - '\u{267B}', - '\u{267E}', - '\u{2692}', - '\u{2694}', - '\u{2695}', - '\u{2696}', - '\u{2697}', - '\u{2699}', - '\u{269B}', - '\u{269C}', - '\u{26A0}', - '\u{26A7}', - '\u{26B0}', - '\u{26B1}', - '\u{26C8}', - '\u{26CF}', - '\u{26D1}', - '\u{26D3}', - '\u{26E9}', - '\u{26F0}', - '\u{26F1}', - '\u{26F4}', - '\u{26F7}', - '\u{26F8}', - '\u{26F9}', - '\u{2702}', - '\u{2708}', - '\u{2709}', - '\u{270C}', - '\u{270D}', - '\u{270F}', - '\u{2712}', - '\u{2714}', - '\u{2716}', - '\u{271D}', - '\u{2721}', - '\u{2733}', - '\u{2734}', - '\u{2744}', - '\u{2747}', - '\u{2763}', - '\u{2764}', - '\u{27A1}', - '\u{2934}', - '\u{2935}', - '\u{2B05}', - '\u{2B06}', - '\u{2B07}', - '\u{1F170}', - '\u{1F171}', - '\u{1F17E}', - '\u{1F17F}', - '\u{1F321}', - '\u{1F324}', - '\u{1F325}', - '\u{1F326}', - '\u{1F327}', - '\u{1F328}', - '\u{1F329}', - '\u{1F32A}', - '\u{1F32B}', - '\u{1F32C}', - '\u{1F336}', - '\u{1F37D}', - '\u{1F396}', - '\u{1F397}', - '\u{1F399}', - '\u{1F39A}', - '\u{1F39B}', - '\u{1F39E}', - '\u{1F39F}', - '\u{1F3CB}', - '\u{1F3CC}', - '\u{1F3CD}', - '\u{1F3CE}', - '\u{1F3D4}', - '\u{1F3D5}', - '\u{1F3D6}', - '\u{1F3D7}', - '\u{1F3D8}', - '\u{1F3D9}', - '\u{1F3DA}', - '\u{1F3DB}', - '\u{1F3DC}', - '\u{1F3DD}', - '\u{1F3DE}', - '\u{1F3DF}', - '\u{1F3F3}', - '\u{1F3F5}', - '\u{1F3F7}', - '\u{1F43F}', - '\u{1F441}', - '\u{1F4FD}', - '\u{1F549}', - '\u{1F54A}', - '\u{1F56F}', - '\u{1F570}', - '\u{1F573}', - '\u{1F574}', - '\u{1F575}', - '\u{1F576}', - '\u{1F577}', - '\u{1F578}', - '\u{1F579}', - '\u{1F587}', - '\u{1F58A}', - '\u{1F58B}', - '\u{1F58C}', - '\u{1F58D}', - '\u{1F590}', - '\u{1F5A5}', - '\u{1F5A8}', - '\u{1F5B1}', - '\u{1F5B2}', - '\u{1F5BC}', - '\u{1F5C2}', - '\u{1F5C3}', - '\u{1F5C4}', - '\u{1F5D1}', - '\u{1F5D2}', - '\u{1F5D3}', - '\u{1F5DC}', - '\u{1F5DD}', - '\u{1F5DE}', - '\u{1F5E1}', - '\u{1F5E3}', - '\u{1F5E8}', - '\u{1F5EF}', - '\u{1F5F3}', - '\u{1F5FA}', - '\u{1F6CB}', - '\u{1F6CD}', - '\u{1F6CE}', - '\u{1F6CF}', - '\u{1F6E0}', - '\u{1F6E1}', - '\u{1F6E2}', - '\u{1F6E3}', - '\u{1F6E4}', - '\u{1F6E5}', - '\u{1F6E9}', - '\u{1F6F0}', - '\u{1F6F3}', + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. 
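To make the indexing scheme concrete, this is the arithmetic `starts_emoji_presentation_seq` performs for U+1F6F3 (one of the codepoints exercised by the new tests); only the bit positions are computed in this sketch, the table contents themselves are not assumed:

    fn main() {
        let cp: u32 = 0x1F6F3;
        let top_byte = (cp >> 9) as u8;
        assert_eq!(top_byte, 0xFB);
        assert_eq!(top_byte >> 1, 0x7D); // byte index into EMOJI_PRESENTATION_INDEX
        assert_eq!(top_byte & 1, 1); // odd, so the high nibble of that byte is used
        assert_eq!((cp >> 3) & 0x3F, 0x1E); // byte index into the selected 64-byte leaf
        assert_eq!(cp & 7, 3); // bit index within that leaf byte
    }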
+ static EMOJI_PRESENTATION_INDEX: [u8; 128] = [ + 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x21, 0x43, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0xA9, 0xFF, 0xFF, + ]; + + /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; 11] = [ + [ + 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, + ], + [ + 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, + 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, + 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, + 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, + 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, 0xA8, 0x00, + ], + [ + 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, + 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, + 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, + ], + [ + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, + 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], ]; } diff --git a/src/tests.rs b/src/tests.rs index e8f6686..33815c8 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -285,4 +285,11 @@ fn test_emoji_presentation() { assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{0023}\u{FE0F}a"), 4); + + assert_eq!(UnicodeWidthStr::width("\u{002A}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{23F9}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{24C2}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F6F3}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F700}\u{FE0F}"), 1); } From 51a8417472f9805384a5c27534a62252cc9415ec Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 11:41:31 -0500 Subject: [PATCH 04/13] Document exact width rules --- .github/workflows/rust.yml | 6 ++++++ src/lib.rs | 41 ++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7731d4c..eef84a9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -7,7 +7,11 @@ on: branches: [ "master" ] env: + CARGO_INCREMENTAL: 0 CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + RUSTFLAGS: -D warnings + RUSTDOCFLAGS: -D warnings jobs: build: @@ -18,6 +22,8 @@ jobs: run: cargo build --verbose - name: Run tests run: cargo test --verbose + - name: Build docs + run: cargo doc regen: runs-on: ubuntu-latest steps: diff 
--git a/src/lib.rs b/src/lib.rs index 45d97e0..4e3813a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,11 @@ // except according to those terms. //! Determine displayed width of `char` and `str` types according to -//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) -//! rules. +//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/), +//! other portions of the Unicode standard, and common implementations of +//! POSIX [`wcwidth()`](https://pubs.opengroup.org/onlinepubs/9699919799/). +//! See the [Rules for determining width](#rules-for-determining-width) section +//! for the exact rules. //! //! ```rust //! extern crate unicode_width; @@ -41,6 +44,34 @@ //! [dependencies] //! unicode-width = "0.1.5" //! ``` +//! # Rules for determining width +//! +//! This crate currently uses the following rules to determine the width of a +//! character or string, in order of decreasing precedence. These may be tweaked in the future. +//! +//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence) +//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.) +//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. +//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 4. The following have width 0: +//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AHangul_Syllable_Type%253D%252FV%7CT%252F%253A%5D) +//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) +//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`), +//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253ADefault_Ignorable_Code_Point%253DYes%253A%5D) +//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property, +//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253D%252FMn%7CMe%252F%253A%5D) +//! with a [`General_Category`](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142) +//! of `Nonspacing_Mark` (`Mn`) or `Enclosing_Mark` (`Me`), and +//! 4. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253DCc%253A%5D) +//! have no defined width, and are considered to have width 0 when contained within a string. +//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253D%252FF%7CW%252F%253A%5D) +//! with an [`East_Asian_Width`](https://www.unicode.org/reports/tr11/#ED1) of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2) +//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2. +//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253DA%253A%5D) +//! with an `East_Asian_Width` of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6) +//! have width 2 in an East Asian context, and width 1 otherwise. +//! 8. All other characters have width 1. #![forbid(unsafe_code)] #![deny(missing_docs)] @@ -110,8 +141,7 @@ pub trait UnicodeWidthStr { /// as 1 column wide. This is consistent with the recommendations for /// non-CJK contexts, or when the context cannot be reliably determined. 
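A few spot checks of the numbered rules above, consistent with the crate's documentation and existing test suite:

    use unicode_width::UnicodeWidthChar;

    fn main() {
        // Rule 2: the soft hyphen is one column.
        assert_eq!(UnicodeWidthChar::width('\u{00AD}'), Some(1));
        // Rule 3: U+115F HANGUL CHOSEONG FILLER is two columns.
        assert_eq!(UnicodeWidthChar::width('\u{115F}'), Some(2));
        // Rule 4: a medial vowel jamo such as U+1160 is zero columns.
        assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0));
        // Rule 5: control characters have no width of their own.
        assert_eq!(UnicodeWidthChar::width('\n'), None);
    }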
/// - /// Also consistent with UAX11, this function treats [emoji presentation sequences] - /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// as 2 columns wide. This means that the width of a string may not equal /// the sum of the widths of its individual characters. fn width(&self) -> usize; @@ -125,8 +155,7 @@ pub trait UnicodeWidthStr { /// as 2 column wide. This is consistent with the recommendations for /// CJK contexts. /// - /// Also consistent with UAX11, this function treats [emoji presentation sequences] - /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// as 2 columns wide. This means that the width of a string may not equal /// the sum of the widths of its individual characters. fn width_cjk(&self) -> usize; From 5d8bc25dffec73d2bae2bd3ef0276578634d53f2 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 12:18:36 -0500 Subject: [PATCH 05/13] Add more CI checks --- .github/workflows/rust.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index eef84a9..7f2c9e9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -24,6 +24,10 @@ jobs: run: cargo test --verbose - name: Build docs run: cargo doc + - name: Check formatting + run: cargo fmt --check + - name: Check clippy + run: cargo clippy regen: runs-on: ubuntu-latest steps: From 6beb76f328c6f6eb01a659a4bda9a447a71716b0 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 15 Feb 2024 19:07:42 -0500 Subject: [PATCH 06/13] Add emoji benchmark --- .gitignore | 1 + src/tests.rs | 50 ++++++++++++++++++++------------------------------ 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 2d7d550..12e0bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ Cargo.lock scripts/tmp scripts/*.txt scripts/*.rs +bench_data/* diff --git a/src/tests.rs b/src/tests.rs index 33815c8..5b22a60 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -9,13 +9,12 @@ // except according to those terms. 
#[cfg(feature = "bench")] -use super::{UnicodeWidthChar, UnicodeWidthStr}; -#[cfg(feature = "bench")] -use std::iter; +use std::{iter, string::String}; + #[cfg(feature = "bench")] -use test::Bencher; +use test::{self, Bencher}; -use std::prelude::v1::*; +use super::{UnicodeWidthChar, UnicodeWidthStr}; #[cfg(feature = "bench")] #[bench] @@ -95,6 +94,7 @@ fn simple_width_match(c: char) -> Option { } #[cfg(feature = "bench")] #[bench] + fn enwik8(b: &mut Bencher) { // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip let data_path = "bench_data/enwik8"; @@ -103,13 +103,25 @@ fn enwik8(b: &mut Bencher) { } #[cfg(feature = "bench")] #[bench] + fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20220501-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20220501/jawiki-20220501-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20220501-pages-articles-multistream-index.txt"; + // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; let string = std::fs::read_to_string(data_path).unwrap_or_default(); b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); } + +#[cfg(feature = "bench")] +#[bench] + +fn emoji(b: &mut Bencher) { + // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt + let data_path = "bench_data/emoji-style.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + #[test] fn test_str() { use super::UnicodeWidthStr; @@ -130,8 +142,6 @@ fn test_str() { #[test] fn test_emoji() { // Example from the README. 
- use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist @@ -139,8 +149,6 @@ fn test_emoji() { #[test] fn test_char() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('h'), Some(2)); assert_eq!('h'.width_cjk(), Some(2)); assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); @@ -153,8 +161,6 @@ fn test_char() { #[test] fn test_char2() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); assert_eq!('\x00'.width_cjk(), Some(0)); @@ -182,15 +188,11 @@ fn test_char2() { #[test] fn unicode_12() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1F971}'), Some(2)); } #[test] fn test_default_ignorable() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{E0000}'), Some(0)); assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0)); @@ -200,8 +202,6 @@ fn test_default_ignorable() { #[test] fn test_jamo() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1100}'), Some(2)); assert_eq!(UnicodeWidthChar::width('\u{A97C}'), Some(2)); // Special case: U+115F HANGUL CHOSEONG FILLER @@ -214,8 +214,6 @@ fn test_jamo() { #[test] fn test_prepended_concatenation_marks() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{0600}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{070F}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1)); @@ -224,8 +222,6 @@ fn test_prepended_concatenation_marks() { #[test] fn test_interlinear_annotation_chars() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1)); @@ -233,8 +229,6 @@ fn test_interlinear_annotation_chars() { #[test] fn test_hieroglyph_format_controls() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1)); @@ -275,10 +269,6 @@ fn test_canonical_equivalence() { #[test] fn test_emoji_presentation() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - #[cfg(feature = "no_std")] - use core::option::Option::Some; - assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); From ad55481fa7e4f528b75a4b3ac81540ebf1517e5a Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 12:33:17 -0500 Subject: [PATCH 07/13] Address review comments --- scripts/unicode.py | 36 ++++++++++++++++++++++++++++-------- src/tables.rs | 23 ++++++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 4195c65..4ad2139 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -69,7 +69,7 @@ def fetch_open(filename: str): fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure. 
""" basename = os.path.basename(filename) - if not os.path.exists(os.path.basename(filename)): + if not os.path.exists(basename): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: return open(basename, encoding="utf-8") @@ -417,6 +417,8 @@ def load_variation_sequences() -> "list[int]": an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + # Match all emoji presentation sequences + # (one codepoint followed by U+FE0F, and labeled "emoji style") sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") codepoints = [] for line in sequences.readlines(): @@ -452,7 +454,12 @@ def make_variation_sequence_table( # So store their info only when it wouldn't inflate the size of the tables. keys = list(prefixes_dict.keys()) for k in keys: - if all(map(lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, prefixes_dict[k])): + if all( + map( + lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, + prefixes_dict[k], + ) + ): del prefixes_dict[k] # Another assumption made by the data structure. @@ -470,7 +477,7 @@ def make_variation_sequence_table( index.append(next) leaves = [] - for leaf_idx, cps in enumerate(prefixes_dict.values()): + for cps in prefixes_dict.values(): leaf = [0] * 64 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) @@ -572,19 +579,32 @@ def emit_module( #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool {{ let cp: u32 = c.into(); - let Ok(top_byte): Result = ((cp) >> 9).try_into() else {{ + + // The largest codepoint for which this function returns `true` + // has 17 significant bits. Extract the most significant 8 of these, + // or return `false` if `cp` is outside this range. + let Ok(top_byte): Result = (cp >> 9).try_into() else {{ return false; }}; + // Use the byte from above to obtain the corresponding 4-bit index + // from the indexes table. let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - if index_nibble >= {len(variation_leaves)} {{ + + // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) + // equal to `top_byte` can change width when part of an emoji presentation seq, + // so return `false`. + let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else {{ return false; - }} + }}; - let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] - [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); + let leaf_byte = leaf_row[leaf_row_idx]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 }} """ diff --git a/src/tables.rs b/src/tables.rs index 26da4be..83e974c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -65,19 +65,32 @@ pub mod charwidth { #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); - let Ok(top_byte): Result = ((cp) >> 9).try_into() else { + + // The largest codepoint for which this function returns `true` + // has 17 significant bits. Extract the most significant 8 of these, + // or return `false` if `cp` is outside this range. + let Ok(top_byte): Result = (cp >> 9).try_into() else { return false; }; + // Use the byte from above to obtain the corresponding 4-bit index + // from the indexes table. 
let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - if index_nibble >= 11 { + + // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) + // equal to `top_byte` can change width when part of an emoji presentation seq, + // so return `false`. + let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else { return false; - } + }; - let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] - [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); + let leaf_byte = leaf_row[leaf_row_idx]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 } From 4f80b57dd6c1b57e783f29fb1b43cfbd7be463a0 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 13:29:22 -0500 Subject: [PATCH 08/13] Use `match` instead of array for first level of tree --- scripts/unicode.py | 92 ++++++++++------------------------- src/tables.rs | 119 ++++++++++++++++++--------------------------- 2 files changed, 71 insertions(+), 140 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 4ad2139..fafe1a8 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -432,22 +432,13 @@ def make_variation_sequence_table( seqs: "list[int]", width_map, ) -> "tuple[list[int], list[list[int]]]": - """Generates 2-level look up table for whether a codepoint might start an emoji presentation sequence. + """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. (Characters that are always wide may be excluded.) - First level maps the most significant byte to a 4-bit index (or 0xFF if can't possibly start such a sequence), - second level is a bit array (each leaf is 512 bits long).""" - # The structure of the table currently relies on this. - # It's unlikely to be a problem in the near future - # as this is enough to encompass the entire Basic Multilingual Plane and - # Supplementary Multilingual Plane. - # And the fix is easy if it ever does become a problem: - # just check bits 1 more significant for the index, - # and use 1024-bit leaves instead of 512-bit. - assert seqs[-1] <= 0x1FFFF + The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.""" prefixes_dict = defaultdict(list) for cp in seqs: - prefixes_dict[cp >> 9].append(cp & 0x1FF) + prefixes_dict[cp >> 10].append(cp & 0x3FF) # We don't strictly need to keep track of characters that are always wide, # because being in an emoji variation seq won't affect their width. @@ -456,34 +447,22 @@ def make_variation_sequence_table( for k in keys: if all( map( - lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, + lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, prefixes_dict[k], ) ): del prefixes_dict[k] - # Another assumption made by the data structure. 
- # Ensures 4 bits are enough to index into subtable - assert len(prefixes_dict.keys()) <= 15 - index_nibbles = [0xF] * 256 - for idx, k in enumerate(prefixes_dict.keys()): - index_nibbles[k] = idx - - index = [] - for tup in batched(index_nibbles, 2): - next = 0 - for i in range(0, 2): - next |= tup[i] << (4 * i) - index.append(next) + print(prefixes_dict) leaves = [] for cps in prefixes_dict.values(): - leaf = [0] * 64 + leaf = [0] * 128 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) - return (index, leaves) + return (list(prefixes_dict.keys()), leaves) def emit_module( @@ -580,29 +559,23 @@ def emit_module( pub fn starts_emoji_presentation_seq(c: char) -> bool {{ let cp: u32 = c.into(); - // The largest codepoint for which this function returns `true` - // has 17 significant bits. Extract the most significant 8 of these, - // or return `false` if `cp` is outside this range. - let Ok(top_byte): Result = (cp >> 9).try_into() else {{ - return false; - }}; + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits {{ +""" + ) - // Use the byte from above to obtain the corresponding 4-bit index - // from the indexes table. - let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; - let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + for i, msbs in enumerate(variation_idx): + module.write(f" {msbs} => {i},\n") - // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) - // equal to `top_byte` can change width when part of an emoji presentation seq, - // so return `false`. - let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else {{ - return false; + module.write( + f""" _ => return false, }}; - // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. - let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); - let leaf_byte = leaf_row[leaf_row_idx]; + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -663,24 +636,9 @@ def emit_module( module.write( f""" - /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) - /// of the char you want to test. 0xF means it's not part of a presentation seq, - /// anything else means index into the next table. - static EMOJI_PRESENTATION_INDEX: [u8; {len(variation_idx)}] = [ -""" - ) - for row in batched(variation_idx, 15): - module.write(" ") - for idx in row: - module.write(f" 0x{idx:02X},") - module.write("\n") - module.write(" ];\n") - - module.write( - f""" - /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; {len(variation_leaves)}] = [ + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. 
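
Not something any patch in this series adds, just an illustration: the two-level lookup emitted above is easier to follow against a concrete, hand-built table. The sketch below mirrors the same scheme (a match on `cp >> 10`, then a 1024-bit bitmap indexed by the 10 least-significant bits) using a single made-up leaf; `starts_seq_sketch` and the leaf contents are assumptions for the example, and only the bit arithmetic follows the generated code.

// Standalone sketch, not generated code.
fn starts_seq_sketch(c: char) -> bool {
    // One hypothetical 1024-bit (128-byte) leaf covering U+2000..=U+23FF
    // (i.e. cp >> 10 == 8), with only the bit for U+231A (WATCH) set.
    let mut leaves = [[0u8; 128]; 1];
    leaves[0][0x31A >> 3] |= 1 << (0x31A & 7); // the 10 LSB of U+231A are 0x31A

    let cp: u32 = c.into();

    // First level: everything above the 10 LSB selects a leaf, or bails out.
    let idx_of_leaf: usize = match cp >> 10 {
        8 => 0,            // 0x2000..=0x23FF maps to the only leaf here
        _ => return false, // no leaf stored: cannot start a sequence
    };

    // Second level: bits 3..=9 select a byte in the leaf, bits 0..=2 the bit.
    // (The generated code uses `usize::try_from` instead of `as`.)
    let idx_within_leaf = ((cp >> 3) & 0x7F) as usize;
    let leaf_byte = leaves[idx_of_leaf][idx_within_leaf];
    ((leaf_byte >> (cp & 7)) & 1) == 1
}

fn main() {
    assert!(starts_seq_sketch('\u{231A}')); // bit set in the sketch leaf
    assert!(!starts_seq_sketch('a')); // this sketch stores no leaf for cp >> 10 == 0
}

For U+231A, `cp >> 10` is 8, which is also one of the arms in the real generated match; the bit test then lands on byte 0x63, bit 2 of the leaf.
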
+ static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; {len(variation_leaves)}] = [ """ ) for leaf in variation_leaves: @@ -739,11 +697,11 @@ def main(module_filename: str): size_bytes = len(table.to_bytes()) print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes - emoji_index_size = len(variation_table[0]) - print(f"Emoji Presentation Index Size: {emoji_index_size} bytes") + emoji_index_size = len(variation_table[0]) * 4 + print(f"Emoji presentation index size: {emoji_index_size} bytes") total_size += emoji_index_size emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) - print(f"Emoji Presentation Leaves Size: {emoji_leaves_size} bytes") + print(f"Emoji presentation leaves Size: {emoji_leaves_size} bytes") total_size += emoji_leaves_size print("------------------------") print(f" Total Size: {total_size} bytes") diff --git a/src/tables.rs b/src/tables.rs index 83e974c..db89a7d 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -66,29 +66,22 @@ pub mod charwidth { pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); - // The largest codepoint for which this function returns `true` - // has 17 significant bits. Extract the most significant 8 of these, - // or return `false` if `cp` is outside this range. - let Ok(top_byte): Result = (cp >> 9).try_into() else { - return false; + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { + 0 => 0, + 8 => 1, + 9 => 2, + 10 => 3, + 124 => 4, + 125 => 5, + _ => return false, }; - // Use the byte from above to obtain the corresponding 4-bit index - // from the indexes table. - let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; - let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - - // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) - // equal to `top_byte` can change width when part of an emoji presentation seq, - // so return `false`. - let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else { - return false; - }; - - // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. - let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); - let leaf_byte = leaf_row[leaf_row_idx]; + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -576,100 +569,80 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; - /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) - /// of the char you want to test. 0xF means it's not part of a presentation seq, - /// anything else means index into the next table. 
- static EMOJI_PRESENTATION_INDEX: [u8; 128] = [ - 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x21, 0x43, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0xA9, 0xFF, 0xFF, - ]; - - /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; 11] = [ + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; 6] = [ [ 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, + 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, + 0x0F, 0x07, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, - ], 
- [ - 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, - 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, - 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, - 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, + 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, + 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, + 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ - 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, - 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, 0xA8, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, + 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, + 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, + 0xA8, 0x00, ], [ 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, - 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, - ], - [ - 0x00, 
0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, - 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, + 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], ]; } From d944bdd3e7f1b1973a4442c0a1c4559b93905c4a Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 13:48:07 -0500 Subject: [PATCH 09/13] Spuriously treat certain always-wide characters as eligible for emoji presentation --- scripts/unicode.py | 41 ++++++++++++++++++++++++----------------- src/tables.rs | 38 +++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index fafe1a8..00b1aea 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -434,17 +434,17 @@ def make_variation_sequence_table( ) -> "tuple[list[int], list[list[int]]]": """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. (Characters that are always wide may be excluded.) - The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.""" + The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. + """ - prefixes_dict = defaultdict(list) + prefixes_dict = defaultdict(set) for cp in seqs: - prefixes_dict[cp >> 10].append(cp & 0x3FF) + prefixes_dict[cp >> 10].add(cp & 0x3FF) # We don't strictly need to keep track of characters that are always wide, # because being in an emoji variation seq won't affect their width. # So store their info only when it wouldn't inflate the size of the tables. - keys = list(prefixes_dict.keys()) - for k in keys: + for k in list(prefixes_dict.keys()): if all( map( lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, @@ -453,7 +453,14 @@ def make_variation_sequence_table( ): del prefixes_dict[k] - print(prefixes_dict) + indexes = list(prefixes_dict.keys()) + + # Similarly, we can spuriously return `true` for always-wide characters + # even if not part of a presentation seq; this saves an additional lookup, + # so we should do it where there is no size cost. + for cp, width in enumerate(width_map): + if width == EffectiveWidth.WIDE and (cp >> 10) in indexes: + prefixes_dict[cp >> 10].add(cp & 0x3FF) leaves = [] for cps in prefixes_dict.values(): @@ -462,7 +469,7 @@ def make_variation_sequence_table( idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) - return (list(prefixes_dict.keys()), leaves) + return (indexes, leaves) def emit_module( @@ -549,19 +556,19 @@ def emit_module( variation_idx, variation_leaves = variation_table module.write( - f""" + """ /// Whether this character forms an [emoji presentation sequence] /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) - /// when followed by `'\\u{{FEOF}}'`. + /// when followed by `'\\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. 
- /// This may spuriously return `false` for all characters that are always wide. + /// This may spuriously return `true` or `false` for characters that are always wide. #[inline] - pub fn starts_emoji_presentation_seq(c: char) -> bool {{ + pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); // First level of lookup uses all but 10 LSB let top_bits = cp >> 10; - let idx_of_leaf: usize = match top_bits {{ + let idx_of_leaf: usize = match top_bits { """ ) @@ -569,8 +576,8 @@ def emit_module( module.write(f" {msbs} => {i},\n") module.write( - f""" _ => return false, - }}; + """ _ => return false, + }; // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. @@ -579,7 +586,7 @@ def emit_module( // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 - }} + } """ ) @@ -701,10 +708,10 @@ def main(module_filename: str): print(f"Emoji presentation index size: {emoji_index_size} bytes") total_size += emoji_index_size emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) - print(f"Emoji presentation leaves Size: {emoji_leaves_size} bytes") + print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes") total_size += emoji_leaves_size print("------------------------") - print(f" Total Size: {total_size} bytes") + print(f" Total size: {total_size} bytes") emit_module(module_filename, version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/tables.rs b/src/tables.rs index db89a7d..7e1c899 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -61,7 +61,7 @@ pub mod charwidth { /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// when followed by `'\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. - /// This may spuriously return `false` for all characters that are always wide. + /// This may spuriously return `true` or `false` for characters that are always wide. 
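
An illustration of the contract documented above, not part of the patch: `'\u{FE0F}'` is zero-width on its own, but the two-character string "#\u{FE0F}" forms an emoji presentation sequence of width 2, so a string's width is not simply the sum of its characters' widths. The values below are the same ones `test_emoji_presentation` asserts earlier in this series.

use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

fn main() {
    // '#' alone is narrow, and the variation selector alone is zero-width...
    assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1));
    assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0));
    // ...but together they form an emoji presentation sequence of width 2.
    assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2);
}
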
#[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); @@ -592,7 +592,7 @@ pub mod charwidth { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x0C, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, ], @@ -622,27 +622,27 @@ pub mod charwidth { ], [ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, - 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, - 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, - 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, - 0xA8, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x40, 0xFE, 0x07, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0F, 0xFF, 0x01, 0x03, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xCF, 0xCE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xB9, 0xFF, ], [ - 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, - 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, - 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, - 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, - 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, - 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xBF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0x00, 0x7E, + 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x07, 0x80, 0x3C, 0x61, 0x00, 0x30, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0xF8, 0xE7, 0xF0, 0x3F, 0x1A, 0xF9, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0xFF, 0x0F, + 0x01, 0x00, ], ]; } From a8b2fabb8e3ccc14f0abadff0c447a09c78a3ff9 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 15:31:40 -0500 Subject: [PATCH 10/13] Align `EMOJI_PRESENTATION_LEAVES` to 128 bytes Ensure rows don't cross cache lines, makes a small difference in the benchmarks --- scripts/unicode.py | 9 ++++++--- src/tables.rs | 9 ++++++--- src/tests.rs | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 00b1aea..ed17229 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -582,7 +582,7 @@ def emit_module( // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); - let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -643,9 +643,12 @@ def emit_module( module.write( f""" + #[repr(align(128))] + struct Align128(T); + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; {len(variation_leaves)}] = [ + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([ """ ) for leaf in variation_leaves: @@ -657,7 +660,7 @@ def emit_module( module.write("\n") module.write(" ],\n") - module.write(" ];\n") + module.write(" ]);\n") module.write("}\n") diff --git a/src/tables.rs b/src/tables.rs index 7e1c899..fa76684 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -81,7 +81,7 @@ pub mod charwidth { // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); - let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -569,9 +569,12 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; + #[repr(align(128))] + struct Align128(T); + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. 
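
Why the `Align128` wrapper above helps, as an illustration rather than part of the patch: a 128-byte-aligned array whose rows are exactly 128 bytes keeps every row on a 128-byte boundary of its own, so no row crosses into an extra cache line, which is the effect the commit message describes. A small sketch of that property; `LEAVES` here is a made-up stand-in for the generated table.

// Same shape of wrapper as the one added above, applied to a dummy table.
#[repr(align(128))]
struct Align128<T>(T);

// Hypothetical two-row table standing in for EMOJI_PRESENTATION_LEAVES.
static LEAVES: Align128<[[u8; 128]; 2]> = Align128([[0u8; 128]; 2]);

fn main() {
    // The wrapper forces 128-byte alignment of the whole array...
    assert_eq!(core::mem::align_of::<Align128<[[u8; 128]; 2]>>(), 128);
    // ...and because each row is exactly 128 bytes, every row starts on a
    // 128-byte boundary of its own.
    for row in LEAVES.0.iter() {
        assert_eq!(row.as_ptr() as usize % 128, 0);
    }
}
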
- static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; 6] = [ + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; 6]> = Align128([ [ 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -644,5 +647,5 @@ pub mod charwidth { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x0F, 0x01, 0x00, ], - ]; + ]); } diff --git a/src/tests.rs b/src/tests.rs index 5b22a60..676068f 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -12,7 +12,7 @@ use std::{iter, string::String}; #[cfg(feature = "bench")] -use test::{self, Bencher}; +use test::Bencher; use super::{UnicodeWidthChar, UnicodeWidthStr}; From a5066aaffee62fb116101869d713844a053ae302 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 22 Apr 2024 15:34:31 -0400 Subject: [PATCH 11/13] Convert tests into integration tests --- .github/workflows/rust.yml | 2 +- Cargo.toml | 1 - benches/benches.rs | 113 ++++++++++++++++++++++++++++++++++ src/lib.rs | 11 ---- {src => tests}/tests.rs | 120 +------------------------------------ 5 files changed, 115 insertions(+), 132 deletions(-) create mode 100644 benches/benches.rs rename {src => tests}/tests.rs (65%) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7f2c9e9..c0908cb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -27,7 +27,7 @@ jobs: - name: Check formatting run: cargo fmt --check - name: Check clippy - run: cargo clippy + run: cargo clippy --lib --tests regen: runs-on: ubuntu-latest steps: diff --git a/Cargo.toml b/Cargo.toml index 7c44aa6..49e7539 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,6 @@ unicode-normalization = "0.1.23" [features] default = [] -bench = [] rustc-dep-of-std = ['std', 'core', 'compiler_builtins'] # Legacy, now a no-op diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000..c91cef4 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,113 @@ +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+#![feature(test)] + +extern crate test; + +use std::iter; + +use test::Bencher; + +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; + +#[bench] +fn cargo(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(UnicodeWidthChar::width(c)); + } + }); +} + +#[bench] +fn stdlib(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(c.width()); + } + }); +} + +#[bench] +fn simple_if(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_if(c)); + } + }); +} + +#[bench] +fn simple_match(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_match(c)); + } + }); +} + +#[inline] +fn simple_width_if(c: char) -> Option { + let cu = c as u32; + if cu < 127 { + if cu > 31 { + Some(1) + } else if cu == 0 { + Some(0) + } else { + None + } + } else { + UnicodeWidthChar::width(c) + } +} + +#[inline] +fn simple_width_match(c: char) -> Option { + match c as u32 { + cu if cu == 0 => Some(0), + cu if cu < 0x20 => None, + cu if cu < 0x7f => Some(1), + _ => UnicodeWidthChar::width(c), + } +} + +#[bench] +fn enwik8(b: &mut Bencher) { + // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip + let data_path = "bench_data/enwik8"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn jawiki(b: &mut Bencher) { + // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn emoji(b: &mut Bencher) { + // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt + let data_path = "bench_data/emoji-style.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} diff --git a/src/lib.rs b/src/lib.rs index 4e3813a..1ca6bb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,24 +79,13 @@ html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] -#![cfg_attr(feature = "bench", feature(test))] #![no_std] -#[cfg(test)] -#[macro_use] -extern crate std; - -#[cfg(feature = "bench")] -extern crate test; - use tables::charwidth as cw; pub use tables::UNICODE_VERSION; mod tables; -#[cfg(test)] -mod tests; - /// Methods for determining displayed width of Unicode characters. pub trait UnicodeWidthChar { /// Returns the character's displayed width in columns, or `None` if the diff --git a/src/tests.rs b/tests/tests.rs similarity index 65% rename from src/tests.rs rename to tests/tests.rs index 676068f..47218e4 100644 --- a/src/tests.rs +++ b/tests/tests.rs @@ -8,124 +8,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#[cfg(feature = "bench")] -use std::{iter, string::String}; - -#[cfg(feature = "bench")] -use test::Bencher; - -use super::{UnicodeWidthChar, UnicodeWidthStr}; - -#[cfg(feature = "bench")] -#[bench] -fn cargo(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(UnicodeWidthChar::width(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -#[allow(deprecated)] -fn stdlib(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(c.width()); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_if(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_if(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_match(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_match(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_if(c: char) -> Option { - let cu = c as u32; - if cu < 127 { - if cu > 31 { - Some(1) - } else if cu == 0 { - Some(0) - } else { - None - } - } else { - UnicodeWidthChar::width(c) - } -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_match(c: char) -> Option { - match c as u32 { - cu if cu == 0 => Some(0), - cu if cu < 0x20 => None, - cu if cu < 0x7f => Some(1), - _ => UnicodeWidthChar::width(c), - } -} -#[cfg(feature = "bench")] -#[bench] - -fn enwik8(b: &mut Bencher) { - // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip - let data_path = "bench_data/enwik8"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} -#[cfg(feature = "bench")] -#[bench] - -fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} - -#[cfg(feature = "bench")] -#[bench] - -fn emoji(b: &mut Bencher) { - // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt - let data_path = "bench_data/emoji-style.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; #[test] fn test_str() { - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("hello"), 10); assert_eq!("hello".width_cjk(), 10); assert_eq!(UnicodeWidthStr::width("\0\0\0\x01\x01"), 0); @@ -236,8 +122,6 @@ fn test_hieroglyph_format_controls() { #[test] fn test_marks() { - use super::UnicodeWidthChar; - // Nonspacing marks have 0 width assert_eq!(UnicodeWidthChar::width('\u{0301}'), Some(0)); // Enclosing marks have 0 width @@ -250,8 +134,6 @@ fn test_marks() { #[test] fn test_canonical_equivalence() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - for c in '\0'..='\u{10FFFF}' { let mut nfd = String::new(); unicode_normalization::char::decompose_canonical(c, |d| nfd.push(d)); From 
5e8bf9b882c2a52c77ca0d841d73f45370048982 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 22 Apr 2024 16:01:07 -0400 Subject: [PATCH 12/13] Update docs to mention `Grapheme_Extend` --- src/lib.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1ca6bb5..d952880 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,21 +54,28 @@ //! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. //! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. //! 4. The following have width 0: -//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AHangul_Syllable_Type%253D%252FV%7CT%252F%253A%5D) +//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) //! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) //! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`), -//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253ADefault_Ignorable_Code_Point%253DYes%253A%5D) +//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property, -//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253D%252FMn%7CMe%252F%253A%5D) -//! with a [`General_Category`](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142) -//! of `Nonspacing_Mark` (`Mn`) or `Enclosing_Mark` (`Me`), and -//! 4. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). -//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253DCc%253A%5D) +//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) +//! with the [`Grapheme_Extend`](https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf#G52443) property, +//! 4. [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0), +//! [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7), +//! [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8), +//! [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA), +//! [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB), +//! [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B), +//! [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and +//! [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43), +//! 5. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) //! have no defined width, and are considered to have width 0 when contained within a string. -//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253D%252FF%7CW%252F%253A%5D) +//! 6. 
[Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DW%7D%5Cp%7BEast_Asian_Width%3DF%7D)
//! with an [`East_Asian_Width`](https://www.unicode.org/reports/tr11/#ED1) of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
-//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253DA%253A%5D)
+//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
//! with an `East_Asian_Width` of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
//! have width 2 in an East Asian context, and width 1 otherwise.
//! 8. All other characters have width 1.
From 46a60670d7307e5b0a6108c319a4d399c460b74a Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 22 Apr 2024 16:07:41 -0400
Subject: [PATCH 13/13] Update unicode.py comment to match new rules

---
 scripts/unicode.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/unicode.py b/scripts/unicode.py
index ed17229..b50d40f 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -674,7 +674,9 @@ def main(module_filename: str):
 - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
 - Hangul jamo medial vowels & final consonants are zero-width.
 - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
-    - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
+    - Control characters are zero-width.
+    - `Grapheme_Extend` characters, as well as eight spacing marks that canonically decompose to `Grapheme_Extend` characters,
+      are zero-width.
 - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
 - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
 - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width