From 130f3fde22fe9bed5da40c55a93430ce262e9a4f Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sat, 10 Feb 2024 16:43:23 -0500 Subject: [PATCH 01/13] Treat emoji presentation sequences as fullwidth --- scripts/unicode.py | 73 ++++++++++++++-- src/lib.rs | 31 ++++++- src/tables.rs | 209 +++++++++++++++++++++++++++++++++++++++++++++ src/tests.rs | 14 +++ 4 files changed, 320 insertions(+), 7 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index e91f001..fe0236c 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -66,12 +66,13 @@ def fetch_open(filename: str): """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure. """ + basename = os.path.basename(filename) if not os.path.exists(os.path.basename(filename)): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: - return open(filename, encoding="utf-8") + return open(basename, encoding="utf-8") except OSError: - sys.stderr.write(f"cannot load {filename}") + sys.stderr.write(f"cannot load {basename}") sys.exit(1) @@ -152,7 +153,8 @@ def load_zero_widths() -> "list[bool]": - it is in general category `Cc`, - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), - - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`), + - or if it is one of U+0CC0, U+0CC7, U+0CC8, U+0CCA, U+0CCB, U+1B3B, U+1B3D, or U+1B43, + - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`) and is not U+115F, - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`). """ @@ -408,8 +410,29 @@ def make_tables( return tables +def variation_sequences() -> "list[tuple[int, int]]": + """Outputs a list of character ranages, corresponding to all the valid characters for starting + an emoji presentation sequence.""" + + with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") + ranges = [] + for line in sequences.readlines(): + if match := sequence.match(line): + cp = int(match.group(1), 16) + if ranges != [] and ranges[-1][1] == cp - 1: + ranges[-1] = (ranges[-1][0], cp) + else: + ranges.append((cp, cp)) + + return ranges + + def emit_module( - out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" + out_name: str, + unicode_version: "tuple[int, int, int]", + tables: "list[Table]", + emoji_variations: "list[tuple[int, int]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -486,6 +509,31 @@ def emit_module( """ ) + module.write( + """ + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\\u{FEOF}'`. + /// Emoji presentation sequences are considered to have width 2. 
+ #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + + EMOJI_PRESENTATION_RANGES + .binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() + } +""" + ) + module.write( """ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -534,6 +582,20 @@ def emit_module( module.write(f" 0x{byte:02X},") module.write("\n ];\n") subtable_count = new_subtable_count + + # emoji table + + module.write( + f""" + /// Each tuple corresponds to a range (inclusive at both ends) + /// of characters that can start an emoji presentation sequence. + static EMOJI_PRESENTATION_RANGES: [(char, char); {len(emoji_variations)}] = [ +""" + ) + for lo, hi in emoji_variations: + module.write(f" ('\\u{{{lo:X}}}', '\\u{{{hi:X}}}'),\n") + module.write(" ];\n") + module.write("}\n") @@ -569,6 +631,7 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) + emoji_variations = variation_sequences() print("------------------------") total_size = 0 @@ -579,7 +642,7 @@ def main(module_filename: str): print("------------------------") print(f" Total Size: {total_size} bytes") - emit_module(module_filename, version, tables) + emit_module(module_filename, version, tables, emoji_variations) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index 2f22613..aec3b74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -108,6 +108,11 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 1 column wide. This is consistent with the recommendations for /// non-CJK contexts, or when the context cannot be reliably determined. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. fn width(&self) -> usize; /// Returns the string's displayed width in columns. @@ -118,17 +123,39 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 2 column wide. This is consistent with the recommendations for /// CJK contexts. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. 
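As a concrete illustration of the caveat documented above: a string's width stops being the sum of its characters' widths once a variation selector upgrades its base character. A minimal usage sketch, mirroring the values asserted by the tests added later in this patch:

    use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

    fn main() {
        // Mirrors the tests added in src/tests.rs by this patch.
        assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); // '#' on its own
        assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); // variation selector-16
        assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); // the two together
    }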
fn width_cjk(&self) -> usize; } impl UnicodeWidthStr for str { #[inline] fn width(&self) -> usize { - self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum() + str_width(self, false) } #[inline] fn width_cjk(&self) -> usize { - self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum() + str_width(self, true) } } + +fn str_width(s: &str, is_cjk: bool) -> usize { + s.chars() + .rfold((0, false), |(sum, was_fe0f), c| { + if c == '\u{FE0F}' { + (sum, true) + } else { + let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) { + 2 + } else { + cw::width(c, is_cjk).unwrap_or(0) + }; + (sum + add, false) + } + }) + .0 +} diff --git a/src/tables.rs b/src/tables.rs index 8e2e9eb..4e1064d 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -57,6 +57,27 @@ pub mod charwidth { } } + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\u{FEOF}'`. + /// Emoji presentation sequences are considered to have width 2. + #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + use core::cmp::Ordering::{Equal, Greater, Less}; + + EMOJI_PRESENTATION_RANGES + .binary_search_by(|&(lo, hi)| { + if lo > c { + Greater + } else if hi < c { + Less + } else { + Equal + } + }) + .is_ok() + } + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or /// `None` if `c` is a control character other than `'\x00'`. /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, @@ -538,4 +559,192 @@ pub mod charwidth { 0x55, 0xAA, 0xAA, 0x56, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; + + /// Each tuple corresponds to a range (inclusive at both ends) + /// of characters that can start an emoji presentation sequence. 
+ static EMOJI_PRESENTATION_RANGES: [(char, char); 183] = [ + ('\u{23}', '\u{23}'), + ('\u{2A}', '\u{2A}'), + ('\u{30}', '\u{39}'), + ('\u{A9}', '\u{A9}'), + ('\u{AE}', '\u{AE}'), + ('\u{203C}', '\u{203C}'), + ('\u{2049}', '\u{2049}'), + ('\u{2122}', '\u{2122}'), + ('\u{2139}', '\u{2139}'), + ('\u{2194}', '\u{2199}'), + ('\u{21A9}', '\u{21AA}'), + ('\u{231A}', '\u{231B}'), + ('\u{2328}', '\u{2328}'), + ('\u{23CF}', '\u{23CF}'), + ('\u{23E9}', '\u{23F3}'), + ('\u{23F8}', '\u{23FA}'), + ('\u{24C2}', '\u{24C2}'), + ('\u{25AA}', '\u{25AB}'), + ('\u{25B6}', '\u{25B6}'), + ('\u{25C0}', '\u{25C0}'), + ('\u{25FB}', '\u{25FE}'), + ('\u{2600}', '\u{2604}'), + ('\u{260E}', '\u{260E}'), + ('\u{2611}', '\u{2611}'), + ('\u{2614}', '\u{2615}'), + ('\u{2618}', '\u{2618}'), + ('\u{261D}', '\u{261D}'), + ('\u{2620}', '\u{2620}'), + ('\u{2622}', '\u{2623}'), + ('\u{2626}', '\u{2626}'), + ('\u{262A}', '\u{262A}'), + ('\u{262E}', '\u{262F}'), + ('\u{2638}', '\u{263A}'), + ('\u{2640}', '\u{2640}'), + ('\u{2642}', '\u{2642}'), + ('\u{2648}', '\u{2653}'), + ('\u{265F}', '\u{2660}'), + ('\u{2663}', '\u{2663}'), + ('\u{2665}', '\u{2666}'), + ('\u{2668}', '\u{2668}'), + ('\u{267B}', '\u{267B}'), + ('\u{267E}', '\u{267F}'), + ('\u{2692}', '\u{2697}'), + ('\u{2699}', '\u{2699}'), + ('\u{269B}', '\u{269C}'), + ('\u{26A0}', '\u{26A1}'), + ('\u{26A7}', '\u{26A7}'), + ('\u{26AA}', '\u{26AB}'), + ('\u{26B0}', '\u{26B1}'), + ('\u{26BD}', '\u{26BE}'), + ('\u{26C4}', '\u{26C5}'), + ('\u{26C8}', '\u{26C8}'), + ('\u{26CE}', '\u{26CF}'), + ('\u{26D1}', '\u{26D1}'), + ('\u{26D3}', '\u{26D4}'), + ('\u{26E9}', '\u{26EA}'), + ('\u{26F0}', '\u{26F5}'), + ('\u{26F7}', '\u{26FA}'), + ('\u{26FD}', '\u{26FD}'), + ('\u{2702}', '\u{2702}'), + ('\u{2705}', '\u{2705}'), + ('\u{2708}', '\u{270D}'), + ('\u{270F}', '\u{270F}'), + ('\u{2712}', '\u{2712}'), + ('\u{2714}', '\u{2714}'), + ('\u{2716}', '\u{2716}'), + ('\u{271D}', '\u{271D}'), + ('\u{2721}', '\u{2721}'), + ('\u{2728}', '\u{2728}'), + ('\u{2733}', '\u{2734}'), + ('\u{2744}', '\u{2744}'), + ('\u{2747}', '\u{2747}'), + ('\u{274C}', '\u{274C}'), + ('\u{274E}', '\u{274E}'), + ('\u{2753}', '\u{2755}'), + ('\u{2757}', '\u{2757}'), + ('\u{2763}', '\u{2764}'), + ('\u{2795}', '\u{2797}'), + ('\u{27A1}', '\u{27A1}'), + ('\u{27B0}', '\u{27B0}'), + ('\u{27BF}', '\u{27BF}'), + ('\u{2934}', '\u{2935}'), + ('\u{2B05}', '\u{2B07}'), + ('\u{2B1B}', '\u{2B1C}'), + ('\u{2B50}', '\u{2B50}'), + ('\u{2B55}', '\u{2B55}'), + ('\u{3030}', '\u{3030}'), + ('\u{303D}', '\u{303D}'), + ('\u{3297}', '\u{3297}'), + ('\u{3299}', '\u{3299}'), + ('\u{1F004}', '\u{1F004}'), + ('\u{1F170}', '\u{1F171}'), + ('\u{1F17E}', '\u{1F17F}'), + ('\u{1F202}', '\u{1F202}'), + ('\u{1F21A}', '\u{1F21A}'), + ('\u{1F22F}', '\u{1F22F}'), + ('\u{1F237}', '\u{1F237}'), + ('\u{1F30D}', '\u{1F30F}'), + ('\u{1F315}', '\u{1F315}'), + ('\u{1F31C}', '\u{1F31C}'), + ('\u{1F321}', '\u{1F321}'), + ('\u{1F324}', '\u{1F32C}'), + ('\u{1F336}', '\u{1F336}'), + ('\u{1F378}', '\u{1F378}'), + ('\u{1F37D}', '\u{1F37D}'), + ('\u{1F393}', '\u{1F393}'), + ('\u{1F396}', '\u{1F397}'), + ('\u{1F399}', '\u{1F39B}'), + ('\u{1F39E}', '\u{1F39F}'), + ('\u{1F3A7}', '\u{1F3A7}'), + ('\u{1F3AC}', '\u{1F3AE}'), + ('\u{1F3C2}', '\u{1F3C2}'), + ('\u{1F3C4}', '\u{1F3C4}'), + ('\u{1F3C6}', '\u{1F3C6}'), + ('\u{1F3CA}', '\u{1F3CE}'), + ('\u{1F3D4}', '\u{1F3E0}'), + ('\u{1F3ED}', '\u{1F3ED}'), + ('\u{1F3F3}', '\u{1F3F3}'), + ('\u{1F3F5}', '\u{1F3F5}'), + ('\u{1F3F7}', '\u{1F3F7}'), + ('\u{1F408}', '\u{1F408}'), + ('\u{1F415}', '\u{1F415}'), + ('\u{1F41F}', '\u{1F41F}'), + 
('\u{1F426}', '\u{1F426}'), + ('\u{1F43F}', '\u{1F43F}'), + ('\u{1F441}', '\u{1F442}'), + ('\u{1F446}', '\u{1F449}'), + ('\u{1F44D}', '\u{1F44E}'), + ('\u{1F453}', '\u{1F453}'), + ('\u{1F46A}', '\u{1F46A}'), + ('\u{1F47D}', '\u{1F47D}'), + ('\u{1F4A3}', '\u{1F4A3}'), + ('\u{1F4B0}', '\u{1F4B0}'), + ('\u{1F4B3}', '\u{1F4B3}'), + ('\u{1F4BB}', '\u{1F4BB}'), + ('\u{1F4BF}', '\u{1F4BF}'), + ('\u{1F4CB}', '\u{1F4CB}'), + ('\u{1F4DA}', '\u{1F4DA}'), + ('\u{1F4DF}', '\u{1F4DF}'), + ('\u{1F4E4}', '\u{1F4E6}'), + ('\u{1F4EA}', '\u{1F4ED}'), + ('\u{1F4F7}', '\u{1F4F7}'), + ('\u{1F4F9}', '\u{1F4FB}'), + ('\u{1F4FD}', '\u{1F4FD}'), + ('\u{1F508}', '\u{1F508}'), + ('\u{1F50D}', '\u{1F50D}'), + ('\u{1F512}', '\u{1F513}'), + ('\u{1F549}', '\u{1F54A}'), + ('\u{1F550}', '\u{1F567}'), + ('\u{1F56F}', '\u{1F570}'), + ('\u{1F573}', '\u{1F579}'), + ('\u{1F587}', '\u{1F587}'), + ('\u{1F58A}', '\u{1F58D}'), + ('\u{1F590}', '\u{1F590}'), + ('\u{1F5A5}', '\u{1F5A5}'), + ('\u{1F5A8}', '\u{1F5A8}'), + ('\u{1F5B1}', '\u{1F5B2}'), + ('\u{1F5BC}', '\u{1F5BC}'), + ('\u{1F5C2}', '\u{1F5C4}'), + ('\u{1F5D1}', '\u{1F5D3}'), + ('\u{1F5DC}', '\u{1F5DE}'), + ('\u{1F5E1}', '\u{1F5E1}'), + ('\u{1F5E3}', '\u{1F5E3}'), + ('\u{1F5E8}', '\u{1F5E8}'), + ('\u{1F5EF}', '\u{1F5EF}'), + ('\u{1F5F3}', '\u{1F5F3}'), + ('\u{1F5FA}', '\u{1F5FA}'), + ('\u{1F610}', '\u{1F610}'), + ('\u{1F687}', '\u{1F687}'), + ('\u{1F68D}', '\u{1F68D}'), + ('\u{1F691}', '\u{1F691}'), + ('\u{1F694}', '\u{1F694}'), + ('\u{1F698}', '\u{1F698}'), + ('\u{1F6AD}', '\u{1F6AD}'), + ('\u{1F6B2}', '\u{1F6B2}'), + ('\u{1F6B9}', '\u{1F6BA}'), + ('\u{1F6BC}', '\u{1F6BC}'), + ('\u{1F6CB}', '\u{1F6CB}'), + ('\u{1F6CD}', '\u{1F6CF}'), + ('\u{1F6E0}', '\u{1F6E5}'), + ('\u{1F6E9}', '\u{1F6E9}'), + ('\u{1F6F0}', '\u{1F6F0}'), + ('\u{1F6F3}', '\u{1F6F3}'), + ]; } diff --git a/src/tests.rs b/src/tests.rs index 9e3805b..e8f6686 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -272,3 +272,17 @@ fn test_canonical_equivalence() { //assert_eq!(c.width_cjk().unwrap_or(0), nfd.width_cjk(), "{c}, {nfd}"); } } + +#[test] +fn test_emoji_presentation() { + use super::{UnicodeWidthChar, UnicodeWidthStr}; + #[cfg(feature = "no_std")] + use core::option::Option::Some; + + assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); + assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); + assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); +} From 6bd8215852ba0110950175aaf074892b381f756b Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 13 Feb 2024 20:49:17 -0500 Subject: [PATCH 02/13] emoji presentation: store single codepoints instead of ranges --- scripts/unicode.py | 42 ++--- src/tables.rs | 412 +++++++++++++++++++++++---------------------- 2 files changed, 230 insertions(+), 224 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index fe0236c..25bbd9e 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -410,29 +410,29 @@ def make_tables( return tables -def variation_sequences() -> "list[tuple[int, int]]": +def load_variation_sequences(width_map) -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting - an emoji presentation sequence.""" + an emoji presentation sequence, exclusing those that are always wide.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: sequence = 
re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") - ranges = [] + codepoints = [] for line in sequences.readlines(): if match := sequence.match(line): cp = int(match.group(1), 16) - if ranges != [] and ranges[-1][1] == cp - 1: - ranges[-1] = (ranges[-1][0], cp) - else: - ranges.append((cp, cp)) - - return ranges + if width_map[cp] == EffectiveWidth.WIDE: + # this character would be width 2 even outside a variation sequence, + # so we don't need to store its info + continue + codepoints.append(cp) + return codepoints def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - emoji_variations: "list[tuple[int, int]]", + emoji_variations: "list[int]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -517,19 +517,7 @@ def emit_module( /// Emoji presentation sequences are considered to have width 2. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - use core::cmp::Ordering::{Equal, Greater, Less}; - - EMOJI_PRESENTATION_RANGES - .binary_search_by(|&(lo, hi)| { - if lo > c { - Greater - } else if hi < c { - Less - } else { - Equal - } - }) - .is_ok() + EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() } """ ) @@ -589,11 +577,11 @@ def emit_module( f""" /// Each tuple corresponds to a range (inclusive at both ends) /// of characters that can start an emoji presentation sequence. - static EMOJI_PRESENTATION_RANGES: [(char, char); {len(emoji_variations)}] = [ + static EMOJI_PRESENTATION_RANGES: [char; {len(emoji_variations)}] = [ """ ) - for lo, hi in emoji_variations: - module.write(f" ('\\u{{{lo:X}}}', '\\u{{{hi:X}}}'),\n") + for cp in emoji_variations: + module.write(f" '\\u{{{cp:X}}}',\n") module.write(" ];\n") module.write("}\n") @@ -631,7 +619,7 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = variation_sequences() + emoji_variations = load_variation_sequences(width_map) print("------------------------") total_size = 0 diff --git a/src/tables.rs b/src/tables.rs index 4e1064d..1f92bdb 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -63,19 +63,7 @@ pub mod charwidth { /// Emoji presentation sequences are considered to have width 2. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - use core::cmp::Ordering::{Equal, Greater, Less}; - - EMOJI_PRESENTATION_RANGES - .binary_search_by(|&(lo, hi)| { - if lo > c { - Greater - } else if hi < c { - Less - } else { - Equal - } - }) - .is_ok() + EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() } /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -562,189 +550,219 @@ pub mod charwidth { /// Each tuple corresponds to a range (inclusive at both ends) /// of characters that can start an emoji presentation sequence. 
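A short sketch of why `load_variation_sequences` above can drop codepoints that are already `EffectiveWidth.WIDE`: a wide base character occupies 2 columns with or without a trailing U+FE0F, so leaving it out of the lookup table cannot change any result. U+231A WATCH (East_Asian_Width=Wide, covered by the old range table but absent from the new single-codepoint list) serves as the example here:

    use unicode_width::UnicodeWidthStr;

    fn main() {
        // U+231A WATCH is East_Asian_Width=Wide, so it is 2 columns on its own...
        assert_eq!(UnicodeWidthStr::width("\u{231A}"), 2);
        // ...and a trailing U+FE0F (width 0) cannot change that, so the table
        // does not need to record it.
        assert_eq!(UnicodeWidthStr::width("\u{231A}\u{FE0F}"), 2);
    }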
- static EMOJI_PRESENTATION_RANGES: [(char, char); 183] = [ - ('\u{23}', '\u{23}'), - ('\u{2A}', '\u{2A}'), - ('\u{30}', '\u{39}'), - ('\u{A9}', '\u{A9}'), - ('\u{AE}', '\u{AE}'), - ('\u{203C}', '\u{203C}'), - ('\u{2049}', '\u{2049}'), - ('\u{2122}', '\u{2122}'), - ('\u{2139}', '\u{2139}'), - ('\u{2194}', '\u{2199}'), - ('\u{21A9}', '\u{21AA}'), - ('\u{231A}', '\u{231B}'), - ('\u{2328}', '\u{2328}'), - ('\u{23CF}', '\u{23CF}'), - ('\u{23E9}', '\u{23F3}'), - ('\u{23F8}', '\u{23FA}'), - ('\u{24C2}', '\u{24C2}'), - ('\u{25AA}', '\u{25AB}'), - ('\u{25B6}', '\u{25B6}'), - ('\u{25C0}', '\u{25C0}'), - ('\u{25FB}', '\u{25FE}'), - ('\u{2600}', '\u{2604}'), - ('\u{260E}', '\u{260E}'), - ('\u{2611}', '\u{2611}'), - ('\u{2614}', '\u{2615}'), - ('\u{2618}', '\u{2618}'), - ('\u{261D}', '\u{261D}'), - ('\u{2620}', '\u{2620}'), - ('\u{2622}', '\u{2623}'), - ('\u{2626}', '\u{2626}'), - ('\u{262A}', '\u{262A}'), - ('\u{262E}', '\u{262F}'), - ('\u{2638}', '\u{263A}'), - ('\u{2640}', '\u{2640}'), - ('\u{2642}', '\u{2642}'), - ('\u{2648}', '\u{2653}'), - ('\u{265F}', '\u{2660}'), - ('\u{2663}', '\u{2663}'), - ('\u{2665}', '\u{2666}'), - ('\u{2668}', '\u{2668}'), - ('\u{267B}', '\u{267B}'), - ('\u{267E}', '\u{267F}'), - ('\u{2692}', '\u{2697}'), - ('\u{2699}', '\u{2699}'), - ('\u{269B}', '\u{269C}'), - ('\u{26A0}', '\u{26A1}'), - ('\u{26A7}', '\u{26A7}'), - ('\u{26AA}', '\u{26AB}'), - ('\u{26B0}', '\u{26B1}'), - ('\u{26BD}', '\u{26BE}'), - ('\u{26C4}', '\u{26C5}'), - ('\u{26C8}', '\u{26C8}'), - ('\u{26CE}', '\u{26CF}'), - ('\u{26D1}', '\u{26D1}'), - ('\u{26D3}', '\u{26D4}'), - ('\u{26E9}', '\u{26EA}'), - ('\u{26F0}', '\u{26F5}'), - ('\u{26F7}', '\u{26FA}'), - ('\u{26FD}', '\u{26FD}'), - ('\u{2702}', '\u{2702}'), - ('\u{2705}', '\u{2705}'), - ('\u{2708}', '\u{270D}'), - ('\u{270F}', '\u{270F}'), - ('\u{2712}', '\u{2712}'), - ('\u{2714}', '\u{2714}'), - ('\u{2716}', '\u{2716}'), - ('\u{271D}', '\u{271D}'), - ('\u{2721}', '\u{2721}'), - ('\u{2728}', '\u{2728}'), - ('\u{2733}', '\u{2734}'), - ('\u{2744}', '\u{2744}'), - ('\u{2747}', '\u{2747}'), - ('\u{274C}', '\u{274C}'), - ('\u{274E}', '\u{274E}'), - ('\u{2753}', '\u{2755}'), - ('\u{2757}', '\u{2757}'), - ('\u{2763}', '\u{2764}'), - ('\u{2795}', '\u{2797}'), - ('\u{27A1}', '\u{27A1}'), - ('\u{27B0}', '\u{27B0}'), - ('\u{27BF}', '\u{27BF}'), - ('\u{2934}', '\u{2935}'), - ('\u{2B05}', '\u{2B07}'), - ('\u{2B1B}', '\u{2B1C}'), - ('\u{2B50}', '\u{2B50}'), - ('\u{2B55}', '\u{2B55}'), - ('\u{3030}', '\u{3030}'), - ('\u{303D}', '\u{303D}'), - ('\u{3297}', '\u{3297}'), - ('\u{3299}', '\u{3299}'), - ('\u{1F004}', '\u{1F004}'), - ('\u{1F170}', '\u{1F171}'), - ('\u{1F17E}', '\u{1F17F}'), - ('\u{1F202}', '\u{1F202}'), - ('\u{1F21A}', '\u{1F21A}'), - ('\u{1F22F}', '\u{1F22F}'), - ('\u{1F237}', '\u{1F237}'), - ('\u{1F30D}', '\u{1F30F}'), - ('\u{1F315}', '\u{1F315}'), - ('\u{1F31C}', '\u{1F31C}'), - ('\u{1F321}', '\u{1F321}'), - ('\u{1F324}', '\u{1F32C}'), - ('\u{1F336}', '\u{1F336}'), - ('\u{1F378}', '\u{1F378}'), - ('\u{1F37D}', '\u{1F37D}'), - ('\u{1F393}', '\u{1F393}'), - ('\u{1F396}', '\u{1F397}'), - ('\u{1F399}', '\u{1F39B}'), - ('\u{1F39E}', '\u{1F39F}'), - ('\u{1F3A7}', '\u{1F3A7}'), - ('\u{1F3AC}', '\u{1F3AE}'), - ('\u{1F3C2}', '\u{1F3C2}'), - ('\u{1F3C4}', '\u{1F3C4}'), - ('\u{1F3C6}', '\u{1F3C6}'), - ('\u{1F3CA}', '\u{1F3CE}'), - ('\u{1F3D4}', '\u{1F3E0}'), - ('\u{1F3ED}', '\u{1F3ED}'), - ('\u{1F3F3}', '\u{1F3F3}'), - ('\u{1F3F5}', '\u{1F3F5}'), - ('\u{1F3F7}', '\u{1F3F7}'), - ('\u{1F408}', '\u{1F408}'), - ('\u{1F415}', '\u{1F415}'), - ('\u{1F41F}', '\u{1F41F}'), - 
('\u{1F426}', '\u{1F426}'), - ('\u{1F43F}', '\u{1F43F}'), - ('\u{1F441}', '\u{1F442}'), - ('\u{1F446}', '\u{1F449}'), - ('\u{1F44D}', '\u{1F44E}'), - ('\u{1F453}', '\u{1F453}'), - ('\u{1F46A}', '\u{1F46A}'), - ('\u{1F47D}', '\u{1F47D}'), - ('\u{1F4A3}', '\u{1F4A3}'), - ('\u{1F4B0}', '\u{1F4B0}'), - ('\u{1F4B3}', '\u{1F4B3}'), - ('\u{1F4BB}', '\u{1F4BB}'), - ('\u{1F4BF}', '\u{1F4BF}'), - ('\u{1F4CB}', '\u{1F4CB}'), - ('\u{1F4DA}', '\u{1F4DA}'), - ('\u{1F4DF}', '\u{1F4DF}'), - ('\u{1F4E4}', '\u{1F4E6}'), - ('\u{1F4EA}', '\u{1F4ED}'), - ('\u{1F4F7}', '\u{1F4F7}'), - ('\u{1F4F9}', '\u{1F4FB}'), - ('\u{1F4FD}', '\u{1F4FD}'), - ('\u{1F508}', '\u{1F508}'), - ('\u{1F50D}', '\u{1F50D}'), - ('\u{1F512}', '\u{1F513}'), - ('\u{1F549}', '\u{1F54A}'), - ('\u{1F550}', '\u{1F567}'), - ('\u{1F56F}', '\u{1F570}'), - ('\u{1F573}', '\u{1F579}'), - ('\u{1F587}', '\u{1F587}'), - ('\u{1F58A}', '\u{1F58D}'), - ('\u{1F590}', '\u{1F590}'), - ('\u{1F5A5}', '\u{1F5A5}'), - ('\u{1F5A8}', '\u{1F5A8}'), - ('\u{1F5B1}', '\u{1F5B2}'), - ('\u{1F5BC}', '\u{1F5BC}'), - ('\u{1F5C2}', '\u{1F5C4}'), - ('\u{1F5D1}', '\u{1F5D3}'), - ('\u{1F5DC}', '\u{1F5DE}'), - ('\u{1F5E1}', '\u{1F5E1}'), - ('\u{1F5E3}', '\u{1F5E3}'), - ('\u{1F5E8}', '\u{1F5E8}'), - ('\u{1F5EF}', '\u{1F5EF}'), - ('\u{1F5F3}', '\u{1F5F3}'), - ('\u{1F5FA}', '\u{1F5FA}'), - ('\u{1F610}', '\u{1F610}'), - ('\u{1F687}', '\u{1F687}'), - ('\u{1F68D}', '\u{1F68D}'), - ('\u{1F691}', '\u{1F691}'), - ('\u{1F694}', '\u{1F694}'), - ('\u{1F698}', '\u{1F698}'), - ('\u{1F6AD}', '\u{1F6AD}'), - ('\u{1F6B2}', '\u{1F6B2}'), - ('\u{1F6B9}', '\u{1F6BA}'), - ('\u{1F6BC}', '\u{1F6BC}'), - ('\u{1F6CB}', '\u{1F6CB}'), - ('\u{1F6CD}', '\u{1F6CF}'), - ('\u{1F6E0}', '\u{1F6E5}'), - ('\u{1F6E9}', '\u{1F6E9}'), - ('\u{1F6F0}', '\u{1F6F0}'), - ('\u{1F6F3}', '\u{1F6F3}'), + static EMOJI_PRESENTATION_RANGES: [char; 213] = [ + '\u{23}', + '\u{2A}', + '\u{30}', + '\u{31}', + '\u{32}', + '\u{33}', + '\u{34}', + '\u{35}', + '\u{36}', + '\u{37}', + '\u{38}', + '\u{39}', + '\u{A9}', + '\u{AE}', + '\u{203C}', + '\u{2049}', + '\u{2122}', + '\u{2139}', + '\u{2194}', + '\u{2195}', + '\u{2196}', + '\u{2197}', + '\u{2198}', + '\u{2199}', + '\u{21A9}', + '\u{21AA}', + '\u{2328}', + '\u{23CF}', + '\u{23ED}', + '\u{23EE}', + '\u{23EF}', + '\u{23F1}', + '\u{23F2}', + '\u{23F8}', + '\u{23F9}', + '\u{23FA}', + '\u{24C2}', + '\u{25AA}', + '\u{25AB}', + '\u{25B6}', + '\u{25C0}', + '\u{25FB}', + '\u{25FC}', + '\u{2600}', + '\u{2601}', + '\u{2602}', + '\u{2603}', + '\u{2604}', + '\u{260E}', + '\u{2611}', + '\u{2618}', + '\u{261D}', + '\u{2620}', + '\u{2622}', + '\u{2623}', + '\u{2626}', + '\u{262A}', + '\u{262E}', + '\u{262F}', + '\u{2638}', + '\u{2639}', + '\u{263A}', + '\u{2640}', + '\u{2642}', + '\u{265F}', + '\u{2660}', + '\u{2663}', + '\u{2665}', + '\u{2666}', + '\u{2668}', + '\u{267B}', + '\u{267E}', + '\u{2692}', + '\u{2694}', + '\u{2695}', + '\u{2696}', + '\u{2697}', + '\u{2699}', + '\u{269B}', + '\u{269C}', + '\u{26A0}', + '\u{26A7}', + '\u{26B0}', + '\u{26B1}', + '\u{26C8}', + '\u{26CF}', + '\u{26D1}', + '\u{26D3}', + '\u{26E9}', + '\u{26F0}', + '\u{26F1}', + '\u{26F4}', + '\u{26F7}', + '\u{26F8}', + '\u{26F9}', + '\u{2702}', + '\u{2708}', + '\u{2709}', + '\u{270C}', + '\u{270D}', + '\u{270F}', + '\u{2712}', + '\u{2714}', + '\u{2716}', + '\u{271D}', + '\u{2721}', + '\u{2733}', + '\u{2734}', + '\u{2744}', + '\u{2747}', + '\u{2763}', + '\u{2764}', + '\u{27A1}', + '\u{2934}', + '\u{2935}', + '\u{2B05}', + '\u{2B06}', + '\u{2B07}', + '\u{1F170}', + '\u{1F171}', + '\u{1F17E}', + '\u{1F17F}', + '\u{1F321}', + 
'\u{1F324}', + '\u{1F325}', + '\u{1F326}', + '\u{1F327}', + '\u{1F328}', + '\u{1F329}', + '\u{1F32A}', + '\u{1F32B}', + '\u{1F32C}', + '\u{1F336}', + '\u{1F37D}', + '\u{1F396}', + '\u{1F397}', + '\u{1F399}', + '\u{1F39A}', + '\u{1F39B}', + '\u{1F39E}', + '\u{1F39F}', + '\u{1F3CB}', + '\u{1F3CC}', + '\u{1F3CD}', + '\u{1F3CE}', + '\u{1F3D4}', + '\u{1F3D5}', + '\u{1F3D6}', + '\u{1F3D7}', + '\u{1F3D8}', + '\u{1F3D9}', + '\u{1F3DA}', + '\u{1F3DB}', + '\u{1F3DC}', + '\u{1F3DD}', + '\u{1F3DE}', + '\u{1F3DF}', + '\u{1F3F3}', + '\u{1F3F5}', + '\u{1F3F7}', + '\u{1F43F}', + '\u{1F441}', + '\u{1F4FD}', + '\u{1F549}', + '\u{1F54A}', + '\u{1F56F}', + '\u{1F570}', + '\u{1F573}', + '\u{1F574}', + '\u{1F575}', + '\u{1F576}', + '\u{1F577}', + '\u{1F578}', + '\u{1F579}', + '\u{1F587}', + '\u{1F58A}', + '\u{1F58B}', + '\u{1F58C}', + '\u{1F58D}', + '\u{1F590}', + '\u{1F5A5}', + '\u{1F5A8}', + '\u{1F5B1}', + '\u{1F5B2}', + '\u{1F5BC}', + '\u{1F5C2}', + '\u{1F5C3}', + '\u{1F5C4}', + '\u{1F5D1}', + '\u{1F5D2}', + '\u{1F5D3}', + '\u{1F5DC}', + '\u{1F5DD}', + '\u{1F5DE}', + '\u{1F5E1}', + '\u{1F5E3}', + '\u{1F5E8}', + '\u{1F5EF}', + '\u{1F5F3}', + '\u{1F5FA}', + '\u{1F6CB}', + '\u{1F6CD}', + '\u{1F6CE}', + '\u{1F6CF}', + '\u{1F6E0}', + '\u{1F6E1}', + '\u{1F6E2}', + '\u{1F6E3}', + '\u{1F6E4}', + '\u{1F6E5}', + '\u{1F6E9}', + '\u{1F6F0}', + '\u{1F6F3}', ]; } From a4d25a96e5a620c381cc64a7c4043874d4c8175f Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 10:23:51 -0500 Subject: [PATCH 03/13] Use a better datastructure Faster and smaller! --- .github/workflows/rust.yml | 3 + Cargo.toml | 9 +- scripts/unicode.py | 138 +++++++++++++--- src/lib.rs | 3 +- src/tables.rs | 328 +++++++++++++------------------------ src/tests.rs | 7 + 6 files changed, 247 insertions(+), 241 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c5f57..7731d4c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,6 +22,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Regen run: cd scripts && python3 unicode.py - name: Diff diff --git a/Cargo.toml b/Cargo.toml index bd8da9c..7c44aa6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,20 +2,23 @@ name = "unicode-width" version = "0.1.11" -authors = ["kwantam ", "Manish Goregaokar "] - +authors = [ + "kwantam ", + "Manish Goregaokar ", +] homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" documentation = "https://unicode-rs.github.io/unicode-width" license = "MIT/Apache-2.0" keywords = ["text", "width", "unicode"] readme = "README.md" +edition = "2021" description = """ Determine displayed width of `char` and `str` types according to Unicode Standard Annex #11 rules. 
""" -exclude = [ "target/*", "Cargo.lock" ] +exclude = ["target/*", "Cargo.lock"] [dependencies] std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } diff --git a/scripts/unicode.py b/scripts/unicode.py index 25bbd9e..4195c65 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -23,6 +23,8 @@ import os import re import sys +from collections import defaultdict +from itertools import batched NUM_CODEPOINTS = 0x110000 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" @@ -410,9 +412,9 @@ def make_tables( return tables -def load_variation_sequences(width_map) -> "list[int]": +def load_variation_sequences() -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting - an emoji presentation sequence, exclusing those that are always wide.""" + an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") @@ -420,19 +422,68 @@ def load_variation_sequences(width_map) -> "list[int]": for line in sequences.readlines(): if match := sequence.match(line): cp = int(match.group(1), 16) - if width_map[cp] == EffectiveWidth.WIDE: - # this character would be width 2 even outside a variation sequence, - # so we don't need to store its info - continue codepoints.append(cp) return codepoints +def make_variation_sequence_table( + seqs: "list[int]", + width_map, +) -> "tuple[list[int], list[list[int]]]": + """Generates 2-level look up table for whether a codepoint might start an emoji presentation sequence. + (Characters that are always wide may be excluded.) + First level maps the most significant byte to a 4-bit index (or 0xFF if can't possibly start such a sequence), + second level is a bit array (each leaf is 512 bits long).""" + # The structure of the table currently relies on this. + # It's unlikely to be a problem in the near future + # as this is enough to encompass the entire Basic Multilingual Plane and + # Supplementary Multilingual Plane. + # And the fix is easy if it ever does become a problem: + # just check bits 1 more significant for the index, + # and use 1024-bit leaves instead of 512-bit. + assert seqs[-1] <= 0x1FFFF + + prefixes_dict = defaultdict(list) + for cp in seqs: + prefixes_dict[cp >> 9].append(cp & 0x1FF) + + # We don't strictly need to keep track of characters that are always wide, + # because being in an emoji variation seq won't affect their width. + # So store their info only when it wouldn't inflate the size of the tables. + keys = list(prefixes_dict.keys()) + for k in keys: + if all(map(lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, prefixes_dict[k])): + del prefixes_dict[k] + + # Another assumption made by the data structure. 
+ # Ensures 4 bits are enough to index into subtable + assert len(prefixes_dict.keys()) <= 15 + index_nibbles = [0xF] * 256 + for idx, k in enumerate(prefixes_dict.keys()): + index_nibbles[k] = idx + + index = [] + for tup in batched(index_nibbles, 2): + next = 0 + for i in range(0, 2): + next |= tup[i] << (4 * i) + index.append(next) + + leaves = [] + for leaf_idx, cps in enumerate(prefixes_dict.values()): + leaf = [0] * 64 + for cp in cps: + idx_in_leaf, bit_shift = divmod(cp, 8) + leaf[idx_in_leaf] |= 1 << bit_shift + leaves.append(leaf) + return (index, leaves) + + def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - emoji_variations: "list[int]", + variation_table: "tuple[list[int], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -509,16 +560,33 @@ def emit_module( """ ) + variation_idx, variation_leaves = variation_table + module.write( - """ + f""" /// Whether this character forms an [emoji presentation sequence] /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) - /// when followed by `'\\u{FEOF}'`. + /// when followed by `'\\u{{FEOF}}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] - pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() - } + pub fn starts_emoji_presentation_seq(c: char) -> bool {{ + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else {{ + return false; + }}; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= {len(variation_leaves)} {{ + return false; + }} + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 + }} """ ) @@ -575,15 +643,36 @@ def emit_module( module.write( f""" - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. - static EMOJI_PRESENTATION_RANGES: [char; {len(emoji_variations)}] = [ + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. + static EMOJI_PRESENTATION_INDEX: [u8; {len(variation_idx)}] = [ """ ) - for cp in emoji_variations: - module.write(f" '\\u{{{cp:X}}}',\n") + for row in batched(variation_idx, 15): + module.write(" ") + for idx in row: + module.write(f" 0x{idx:02X},") + module.write("\n") module.write(" ];\n") + module.write( + f""" + /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; {len(variation_leaves)}] = [ +""" + ) + for leaf in variation_leaves: + module.write(" [\n") + for row in batched(leaf, 14): + module.write(" ") + for entry in row: + module.write(f" 0x{entry:02X},") + module.write("\n") + module.write(" ],\n") + + module.write(" ];\n") module.write("}\n") @@ -593,6 +682,7 @@ def main(module_filename: str): `module_filename`. 
We obey the following rules in decreasing order of importance: + - Emoji presentation sequences are double-width. - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c) - Hangul jamo medial vowels & final consonants are zero-width. - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. @@ -619,18 +709,26 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = load_variation_sequences(width_map) + + emoji_variations = load_variation_sequences() + variation_table = make_variation_sequence_table(emoji_variations, width_map) print("------------------------") total_size = 0 for i, table in enumerate(tables): size_bytes = len(table.to_bytes()) - print(f"Table {i} Size: {size_bytes} bytes") + print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes + emoji_index_size = len(variation_table[0]) + print(f"Emoji Presentation Index Size: {emoji_index_size} bytes") + total_size += emoji_index_size + emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) + print(f"Emoji Presentation Leaves Size: {emoji_leaves_size} bytes") + total_size += emoji_leaves_size print("------------------------") print(f" Total Size: {total_size} bytes") - emit_module(module_filename, version, tables, emoji_variations) + emit_module(module_filename, version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index aec3b74..45d97e0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,8 @@ //! unicode-width = "0.1.5" //! ``` -#![deny(missing_docs, unsafe_code)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" diff --git a/src/tables.rs b/src/tables.rs index 1f92bdb..26da4be 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -61,9 +61,24 @@ pub mod charwidth { /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// when followed by `'\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else { + return false; + }; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= 11 { + return false; + } + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 } /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -548,221 +563,100 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. 
- static EMOJI_PRESENTATION_RANGES: [char; 213] = [ - '\u{23}', - '\u{2A}', - '\u{30}', - '\u{31}', - '\u{32}', - '\u{33}', - '\u{34}', - '\u{35}', - '\u{36}', - '\u{37}', - '\u{38}', - '\u{39}', - '\u{A9}', - '\u{AE}', - '\u{203C}', - '\u{2049}', - '\u{2122}', - '\u{2139}', - '\u{2194}', - '\u{2195}', - '\u{2196}', - '\u{2197}', - '\u{2198}', - '\u{2199}', - '\u{21A9}', - '\u{21AA}', - '\u{2328}', - '\u{23CF}', - '\u{23ED}', - '\u{23EE}', - '\u{23EF}', - '\u{23F1}', - '\u{23F2}', - '\u{23F8}', - '\u{23F9}', - '\u{23FA}', - '\u{24C2}', - '\u{25AA}', - '\u{25AB}', - '\u{25B6}', - '\u{25C0}', - '\u{25FB}', - '\u{25FC}', - '\u{2600}', - '\u{2601}', - '\u{2602}', - '\u{2603}', - '\u{2604}', - '\u{260E}', - '\u{2611}', - '\u{2618}', - '\u{261D}', - '\u{2620}', - '\u{2622}', - '\u{2623}', - '\u{2626}', - '\u{262A}', - '\u{262E}', - '\u{262F}', - '\u{2638}', - '\u{2639}', - '\u{263A}', - '\u{2640}', - '\u{2642}', - '\u{265F}', - '\u{2660}', - '\u{2663}', - '\u{2665}', - '\u{2666}', - '\u{2668}', - '\u{267B}', - '\u{267E}', - '\u{2692}', - '\u{2694}', - '\u{2695}', - '\u{2696}', - '\u{2697}', - '\u{2699}', - '\u{269B}', - '\u{269C}', - '\u{26A0}', - '\u{26A7}', - '\u{26B0}', - '\u{26B1}', - '\u{26C8}', - '\u{26CF}', - '\u{26D1}', - '\u{26D3}', - '\u{26E9}', - '\u{26F0}', - '\u{26F1}', - '\u{26F4}', - '\u{26F7}', - '\u{26F8}', - '\u{26F9}', - '\u{2702}', - '\u{2708}', - '\u{2709}', - '\u{270C}', - '\u{270D}', - '\u{270F}', - '\u{2712}', - '\u{2714}', - '\u{2716}', - '\u{271D}', - '\u{2721}', - '\u{2733}', - '\u{2734}', - '\u{2744}', - '\u{2747}', - '\u{2763}', - '\u{2764}', - '\u{27A1}', - '\u{2934}', - '\u{2935}', - '\u{2B05}', - '\u{2B06}', - '\u{2B07}', - '\u{1F170}', - '\u{1F171}', - '\u{1F17E}', - '\u{1F17F}', - '\u{1F321}', - '\u{1F324}', - '\u{1F325}', - '\u{1F326}', - '\u{1F327}', - '\u{1F328}', - '\u{1F329}', - '\u{1F32A}', - '\u{1F32B}', - '\u{1F32C}', - '\u{1F336}', - '\u{1F37D}', - '\u{1F396}', - '\u{1F397}', - '\u{1F399}', - '\u{1F39A}', - '\u{1F39B}', - '\u{1F39E}', - '\u{1F39F}', - '\u{1F3CB}', - '\u{1F3CC}', - '\u{1F3CD}', - '\u{1F3CE}', - '\u{1F3D4}', - '\u{1F3D5}', - '\u{1F3D6}', - '\u{1F3D7}', - '\u{1F3D8}', - '\u{1F3D9}', - '\u{1F3DA}', - '\u{1F3DB}', - '\u{1F3DC}', - '\u{1F3DD}', - '\u{1F3DE}', - '\u{1F3DF}', - '\u{1F3F3}', - '\u{1F3F5}', - '\u{1F3F7}', - '\u{1F43F}', - '\u{1F441}', - '\u{1F4FD}', - '\u{1F549}', - '\u{1F54A}', - '\u{1F56F}', - '\u{1F570}', - '\u{1F573}', - '\u{1F574}', - '\u{1F575}', - '\u{1F576}', - '\u{1F577}', - '\u{1F578}', - '\u{1F579}', - '\u{1F587}', - '\u{1F58A}', - '\u{1F58B}', - '\u{1F58C}', - '\u{1F58D}', - '\u{1F590}', - '\u{1F5A5}', - '\u{1F5A8}', - '\u{1F5B1}', - '\u{1F5B2}', - '\u{1F5BC}', - '\u{1F5C2}', - '\u{1F5C3}', - '\u{1F5C4}', - '\u{1F5D1}', - '\u{1F5D2}', - '\u{1F5D3}', - '\u{1F5DC}', - '\u{1F5DD}', - '\u{1F5DE}', - '\u{1F5E1}', - '\u{1F5E3}', - '\u{1F5E8}', - '\u{1F5EF}', - '\u{1F5F3}', - '\u{1F5FA}', - '\u{1F6CB}', - '\u{1F6CD}', - '\u{1F6CE}', - '\u{1F6CF}', - '\u{1F6E0}', - '\u{1F6E1}', - '\u{1F6E2}', - '\u{1F6E3}', - '\u{1F6E4}', - '\u{1F6E5}', - '\u{1F6E9}', - '\u{1F6F0}', - '\u{1F6F3}', + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. 
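To make the indexing scheme concrete, this is the arithmetic `starts_emoji_presentation_seq` performs for U+1F6F3 (one of the codepoints exercised by the new tests); only the bit positions are computed in this sketch, the table contents themselves are not assumed:

    fn main() {
        let cp: u32 = 0x1F6F3;
        let top_byte = (cp >> 9) as u8;
        assert_eq!(top_byte, 0xFB);
        assert_eq!(top_byte >> 1, 0x7D); // byte index into EMOJI_PRESENTATION_INDEX
        assert_eq!(top_byte & 1, 1); // odd, so the high nibble of that byte is used
        assert_eq!((cp >> 3) & 0x3F, 0x1E); // byte index into the selected 64-byte leaf
        assert_eq!(cp & 7, 3); // bit index within that leaf byte
    }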
+ static EMOJI_PRESENTATION_INDEX: [u8; 128] = [ + 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x21, 0x43, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0xA9, 0xFF, 0xFF, + ]; + + /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; 11] = [ + [ + 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, + ], + [ + 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, + 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, + 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, + 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, + 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, 0xA8, 0x00, + ], + [ + 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, + 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, + 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, + ], + [ + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, + 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], ]; } diff --git a/src/tests.rs b/src/tests.rs index e8f6686..33815c8 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -285,4 +285,11 @@ fn test_emoji_presentation() { assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{0023}\u{FE0F}a"), 4); + + assert_eq!(UnicodeWidthStr::width("\u{002A}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{23F9}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{24C2}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F6F3}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F700}\u{FE0F}"), 1); } From 51a8417472f9805384a5c27534a62252cc9415ec Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 11:41:31 -0500 Subject: [PATCH 04/13] Document exact width rules --- .github/workflows/rust.yml | 6 ++++++ src/lib.rs | 41 ++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7731d4c..eef84a9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -7,7 +7,11 @@ on: branches: [ "master" ] env: + CARGO_INCREMENTAL: 0 CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + RUSTFLAGS: -D warnings + RUSTDOCFLAGS: -D warnings jobs: build: @@ -18,6 +22,8 @@ jobs: run: cargo build --verbose - name: Run tests run: cargo test --verbose + - name: Build docs + run: cargo doc regen: runs-on: ubuntu-latest steps: diff 
--git a/src/lib.rs b/src/lib.rs index 45d97e0..4e3813a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,11 @@ // except according to those terms. //! Determine displayed width of `char` and `str` types according to -//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) -//! rules. +//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/), +//! other portions of the Unicode standard, and common implementations of +//! POSIX [`wcwidth()`](https://pubs.opengroup.org/onlinepubs/9699919799/). +//! See the [Rules for determining width](#rules-for-determining-width) section +//! for the exact rules. //! //! ```rust //! extern crate unicode_width; @@ -41,6 +44,34 @@ //! [dependencies] //! unicode-width = "0.1.5" //! ``` +//! # Rules for determining width +//! +//! This crate currently uses the following rules to determine the width of a +//! character or string, in order of decreasing precedence. These may be tweaked in the future. +//! +//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence) +//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.) +//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. +//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 4. The following have width 0: +//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AHangul_Syllable_Type%253D%252FV%7CT%252F%253A%5D) +//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) +//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`), +//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253ADefault_Ignorable_Code_Point%253DYes%253A%5D) +//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property, +//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253D%252FMn%7CMe%252F%253A%5D) +//! with a [`General_Category`](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142) +//! of `Nonspacing_Mark` (`Mn`) or `Enclosing_Mark` (`Me`), and +//! 4. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253DCc%253A%5D) +//! have no defined width, and are considered to have width 0 when contained within a string. +//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253D%252FF%7CW%252F%253A%5D) +//! with an [`East_Asian_Width`](https://www.unicode.org/reports/tr11/#ED1) of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2) +//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2. +//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253DA%253A%5D) +//! with an `East_Asian_Width` of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6) +//! have width 2 in an East Asian context, and width 1 otherwise. +//! 8. All other characters have width 1. #![forbid(unsafe_code)] #![deny(missing_docs)] @@ -110,8 +141,7 @@ pub trait UnicodeWidthStr { /// as 1 column wide. This is consistent with the recommendations for /// non-CJK contexts, or when the context cannot be reliably determined. 
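A few spot checks of the numbered rules above, consistent with the crate's documentation and existing test suite:

    use unicode_width::UnicodeWidthChar;

    fn main() {
        // Rule 2: the soft hyphen is one column.
        assert_eq!(UnicodeWidthChar::width('\u{00AD}'), Some(1));
        // Rule 3: U+115F HANGUL CHOSEONG FILLER is two columns.
        assert_eq!(UnicodeWidthChar::width('\u{115F}'), Some(2));
        // Rule 4: a medial vowel jamo such as U+1160 is zero columns.
        assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0));
        // Rule 5: control characters have no width of their own.
        assert_eq!(UnicodeWidthChar::width('\n'), None);
    }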
/// - /// Also consistent with UAX11, this function treats [emoji presentation sequences] - /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// as 2 columns wide. This means that the width of a string may not equal /// the sum of the widths of its individual characters. fn width(&self) -> usize; @@ -125,8 +155,7 @@ pub trait UnicodeWidthStr { /// as 2 column wide. This is consistent with the recommendations for /// CJK contexts. /// - /// Also consistent with UAX11, this function treats [emoji presentation sequences] - /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// as 2 columns wide. This means that the width of a string may not equal /// the sum of the widths of its individual characters. fn width_cjk(&self) -> usize; From 5d8bc25dffec73d2bae2bd3ef0276578634d53f2 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 14 Feb 2024 12:18:36 -0500 Subject: [PATCH 05/13] Add more CI checks --- .github/workflows/rust.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index eef84a9..7f2c9e9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -24,6 +24,10 @@ jobs: run: cargo test --verbose - name: Build docs run: cargo doc + - name: Check formatting + run: cargo fmt --check + - name: Check clippy + run: cargo clippy regen: runs-on: ubuntu-latest steps: From 6beb76f328c6f6eb01a659a4bda9a447a71716b0 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Thu, 15 Feb 2024 19:07:42 -0500 Subject: [PATCH 06/13] Add emoji benchmark --- .gitignore | 1 + src/tests.rs | 50 ++++++++++++++++++++------------------------------ 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 2d7d550..12e0bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ Cargo.lock scripts/tmp scripts/*.txt scripts/*.rs +bench_data/* diff --git a/src/tests.rs b/src/tests.rs index 33815c8..5b22a60 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -9,13 +9,12 @@ // except according to those terms. 
#[cfg(feature = "bench")] -use super::{UnicodeWidthChar, UnicodeWidthStr}; -#[cfg(feature = "bench")] -use std::iter; +use std::{iter, string::String}; + #[cfg(feature = "bench")] -use test::Bencher; +use test::{self, Bencher}; -use std::prelude::v1::*; +use super::{UnicodeWidthChar, UnicodeWidthStr}; #[cfg(feature = "bench")] #[bench] @@ -95,6 +94,7 @@ fn simple_width_match(c: char) -> Option { } #[cfg(feature = "bench")] #[bench] + fn enwik8(b: &mut Bencher) { // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip let data_path = "bench_data/enwik8"; @@ -103,13 +103,25 @@ fn enwik8(b: &mut Bencher) { } #[cfg(feature = "bench")] #[bench] + fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20220501-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20220501/jawiki-20220501-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20220501-pages-articles-multistream-index.txt"; + // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; let string = std::fs::read_to_string(data_path).unwrap_or_default(); b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); } + +#[cfg(feature = "bench")] +#[bench] + +fn emoji(b: &mut Bencher) { + // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt + let data_path = "bench_data/emoji-style.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + #[test] fn test_str() { use super::UnicodeWidthStr; @@ -130,8 +142,6 @@ fn test_str() { #[test] fn test_emoji() { // Example from the README. 
- use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist @@ -139,8 +149,6 @@ fn test_emoji() { #[test] fn test_char() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('h'), Some(2)); assert_eq!('h'.width_cjk(), Some(2)); assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); @@ -153,8 +161,6 @@ fn test_char() { #[test] fn test_char2() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); assert_eq!('\x00'.width_cjk(), Some(0)); @@ -182,15 +188,11 @@ fn test_char2() { #[test] fn unicode_12() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1F971}'), Some(2)); } #[test] fn test_default_ignorable() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{E0000}'), Some(0)); assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0)); @@ -200,8 +202,6 @@ fn test_default_ignorable() { #[test] fn test_jamo() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1100}'), Some(2)); assert_eq!(UnicodeWidthChar::width('\u{A97C}'), Some(2)); // Special case: U+115F HANGUL CHOSEONG FILLER @@ -214,8 +214,6 @@ fn test_jamo() { #[test] fn test_prepended_concatenation_marks() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{0600}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{070F}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1)); @@ -224,8 +222,6 @@ fn test_prepended_concatenation_marks() { #[test] fn test_interlinear_annotation_chars() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1)); @@ -233,8 +229,6 @@ fn test_interlinear_annotation_chars() { #[test] fn test_hieroglyph_format_controls() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1)); @@ -275,10 +269,6 @@ fn test_canonical_equivalence() { #[test] fn test_emoji_presentation() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - #[cfg(feature = "no_std")] - use core::option::Option::Some; - assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); From ad55481fa7e4f528b75a4b3ac81540ebf1517e5a Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 12:33:17 -0500 Subject: [PATCH 07/13] Address review comments --- scripts/unicode.py | 36 ++++++++++++++++++++++++++++-------- src/tables.rs | 23 ++++++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 4195c65..4ad2139 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -69,7 +69,7 @@ def fetch_open(filename: str): fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure. 
""" basename = os.path.basename(filename) - if not os.path.exists(os.path.basename(filename)): + if not os.path.exists(basename): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: return open(basename, encoding="utf-8") @@ -417,6 +417,8 @@ def load_variation_sequences() -> "list[int]": an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + # Match all emoji presentation sequences + # (one codepoint followed by U+FE0F, and labeled "emoji style") sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") codepoints = [] for line in sequences.readlines(): @@ -452,7 +454,12 @@ def make_variation_sequence_table( # So store their info only when it wouldn't inflate the size of the tables. keys = list(prefixes_dict.keys()) for k in keys: - if all(map(lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, prefixes_dict[k])): + if all( + map( + lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, + prefixes_dict[k], + ) + ): del prefixes_dict[k] # Another assumption made by the data structure. @@ -470,7 +477,7 @@ def make_variation_sequence_table( index.append(next) leaves = [] - for leaf_idx, cps in enumerate(prefixes_dict.values()): + for cps in prefixes_dict.values(): leaf = [0] * 64 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) @@ -572,19 +579,32 @@ def emit_module( #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool {{ let cp: u32 = c.into(); - let Ok(top_byte): Result = ((cp) >> 9).try_into() else {{ + + // The largest codepoint for which this function returns `true` + // has 17 significant bits. Extract the most significant 8 of these, + // or return `false` if `cp` is outside this range. + let Ok(top_byte): Result = (cp >> 9).try_into() else {{ return false; }}; + // Use the byte from above to obtain the corresponding 4-bit index + // from the indexes table. let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - if index_nibble >= {len(variation_leaves)} {{ + + // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) + // equal to `top_byte` can change width when part of an emoji presentation seq, + // so return `false`. + let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else {{ return false; - }} + }}; - let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] - [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); + let leaf_byte = leaf_row[leaf_row_idx]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 }} """ diff --git a/src/tables.rs b/src/tables.rs index 26da4be..83e974c 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -65,19 +65,32 @@ pub mod charwidth { #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); - let Ok(top_byte): Result = ((cp) >> 9).try_into() else { + + // The largest codepoint for which this function returns `true` + // has 17 significant bits. Extract the most significant 8 of these, + // or return `false` if `cp` is outside this range. + let Ok(top_byte): Result = (cp >> 9).try_into() else { return false; }; + // Use the byte from above to obtain the corresponding 4-bit index + // from the indexes table. 
let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - if index_nibble >= 11 { + + // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) + // equal to `top_byte` can change width when part of an emoji presentation seq, + // so return `false`. + let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else { return false; - } + }; - let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] - [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); + let leaf_byte = leaf_row[leaf_row_idx]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 } From 4f80b57dd6c1b57e783f29fb1b43cfbd7be463a0 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 13:29:22 -0500 Subject: [PATCH 08/13] Use `match` instead of array for first level of tree --- scripts/unicode.py | 92 ++++++++++------------------------- src/tables.rs | 119 ++++++++++++++++++--------------------------- 2 files changed, 71 insertions(+), 140 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 4ad2139..fafe1a8 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -432,22 +432,13 @@ def make_variation_sequence_table( seqs: "list[int]", width_map, ) -> "tuple[list[int], list[list[int]]]": - """Generates 2-level look up table for whether a codepoint might start an emoji presentation sequence. + """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. (Characters that are always wide may be excluded.) - First level maps the most significant byte to a 4-bit index (or 0xFF if can't possibly start such a sequence), - second level is a bit array (each leaf is 512 bits long).""" - # The structure of the table currently relies on this. - # It's unlikely to be a problem in the near future - # as this is enough to encompass the entire Basic Multilingual Plane and - # Supplementary Multilingual Plane. - # And the fix is easy if it ever does become a problem: - # just check bits 1 more significant for the index, - # and use 1024-bit leaves instead of 512-bit. - assert seqs[-1] <= 0x1FFFF + The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.""" prefixes_dict = defaultdict(list) for cp in seqs: - prefixes_dict[cp >> 9].append(cp & 0x1FF) + prefixes_dict[cp >> 10].append(cp & 0x3FF) # We don't strictly need to keep track of characters that are always wide, # because being in an emoji variation seq won't affect their width. @@ -456,34 +447,22 @@ def make_variation_sequence_table( for k in keys: if all( map( - lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, + lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, prefixes_dict[k], ) ): del prefixes_dict[k] - # Another assumption made by the data structure. 
- # Ensures 4 bits are enough to index into subtable - assert len(prefixes_dict.keys()) <= 15 - index_nibbles = [0xF] * 256 - for idx, k in enumerate(prefixes_dict.keys()): - index_nibbles[k] = idx - - index = [] - for tup in batched(index_nibbles, 2): - next = 0 - for i in range(0, 2): - next |= tup[i] << (4 * i) - index.append(next) + print(prefixes_dict) leaves = [] for cps in prefixes_dict.values(): - leaf = [0] * 64 + leaf = [0] * 128 for cp in cps: idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) - return (index, leaves) + return (list(prefixes_dict.keys()), leaves) def emit_module( @@ -580,29 +559,23 @@ def emit_module( pub fn starts_emoji_presentation_seq(c: char) -> bool {{ let cp: u32 = c.into(); - // The largest codepoint for which this function returns `true` - // has 17 significant bits. Extract the most significant 8 of these, - // or return `false` if `cp` is outside this range. - let Ok(top_byte): Result = (cp >> 9).try_into() else {{ - return false; - }}; + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits {{ +""" + ) - // Use the byte from above to obtain the corresponding 4-bit index - // from the indexes table. - let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; - let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + for i, msbs in enumerate(variation_idx): + module.write(f" {msbs} => {i},\n") - // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) - // equal to `top_byte` can change width when part of an emoji presentation seq, - // so return `false`. - let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else {{ - return false; + module.write( + f""" _ => return false, }}; - // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. - let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); - let leaf_byte = leaf_row[leaf_row_idx]; + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -663,24 +636,9 @@ def emit_module( module.write( f""" - /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) - /// of the char you want to test. 0xF means it's not part of a presentation seq, - /// anything else means index into the next table. - static EMOJI_PRESENTATION_INDEX: [u8; {len(variation_idx)}] = [ -""" - ) - for row in batched(variation_idx, 15): - module.write(" ") - for idx in row: - module.write(f" 0x{idx:02X},") - module.write("\n") - module.write(" ];\n") - - module.write( - f""" - /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; {len(variation_leaves)}] = [ + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. 
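
Not something any patch in this series adds, just an illustration: the two-level lookup emitted above is easier to follow against a concrete, hand-built table. The sketch below mirrors the same scheme (a match on `cp >> 10`, then a 1024-bit bitmap indexed by the 10 least-significant bits) using a single made-up leaf; `starts_seq_sketch` and the leaf contents are assumptions for the example, and only the bit arithmetic follows the generated code.

// Standalone sketch, not generated code.
fn starts_seq_sketch(c: char) -> bool {
    // One hypothetical 1024-bit (128-byte) leaf covering U+2000..=U+23FF
    // (i.e. cp >> 10 == 8), with only the bit for U+231A (WATCH) set.
    let mut leaves = [[0u8; 128]; 1];
    leaves[0][0x31A >> 3] |= 1 << (0x31A & 7); // the 10 LSB of U+231A are 0x31A

    let cp: u32 = c.into();

    // First level: everything above the 10 LSB selects a leaf, or bails out.
    let idx_of_leaf: usize = match cp >> 10 {
        8 => 0,            // 0x2000..=0x23FF maps to the only leaf here
        _ => return false, // no leaf stored: cannot start a sequence
    };

    // Second level: bits 3..=9 select a byte in the leaf, bits 0..=2 the bit.
    // (The generated code uses `usize::try_from` instead of `as`.)
    let idx_within_leaf = ((cp >> 3) & 0x7F) as usize;
    let leaf_byte = leaves[idx_of_leaf][idx_within_leaf];
    ((leaf_byte >> (cp & 7)) & 1) == 1
}

fn main() {
    assert!(starts_seq_sketch('\u{231A}')); // bit set in the sketch leaf
    assert!(!starts_seq_sketch('a')); // this sketch stores no leaf for cp >> 10 == 0
}

For U+231A, `cp >> 10` is 8, which is also one of the arms in the real generated match; the bit test then lands on byte 0x63, bit 2 of the leaf.
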
+ static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; {len(variation_leaves)}] = [ """ ) for leaf in variation_leaves: @@ -739,11 +697,11 @@ def main(module_filename: str): size_bytes = len(table.to_bytes()) print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes - emoji_index_size = len(variation_table[0]) - print(f"Emoji Presentation Index Size: {emoji_index_size} bytes") + emoji_index_size = len(variation_table[0]) * 4 + print(f"Emoji presentation index size: {emoji_index_size} bytes") total_size += emoji_index_size emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) - print(f"Emoji Presentation Leaves Size: {emoji_leaves_size} bytes") + print(f"Emoji presentation leaves Size: {emoji_leaves_size} bytes") total_size += emoji_leaves_size print("------------------------") print(f" Total Size: {total_size} bytes") diff --git a/src/tables.rs b/src/tables.rs index 83e974c..db89a7d 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -66,29 +66,22 @@ pub mod charwidth { pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); - // The largest codepoint for which this function returns `true` - // has 17 significant bits. Extract the most significant 8 of these, - // or return `false` if `cp` is outside this range. - let Ok(top_byte): Result = (cp >> 9).try_into() else { - return false; + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { + 0 => 0, + 8 => 1, + 9 => 2, + 10 => 3, + 124 => 4, + 125 => 5, + _ => return false, }; - // Use the byte from above to obtain the corresponding 4-bit index - // from the indexes table. - let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; - let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; - - // If the index is the 0xF sentinel, then no codepoint with bits 9-16 (0 indexed) - // equal to `top_byte` can change width when part of an emoji presentation seq, - // so return `false`. - let Some(leaf_row) = EMOJI_PRESENTATION_LEAVES.get(usize::from(index_nibble)) else { - return false; - }; - - // Extract the 3-8th (0-indexed) least significant bits of `cp`, + // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. - let leaf_row_idx = usize::try_from((cp >> 3) & 0x3F).unwrap(); - let leaf_byte = leaf_row[leaf_row_idx]; + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -576,100 +569,80 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; - /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) - /// of the char you want to test. 0xF means it's not part of a presentation seq, - /// anything else means index into the next table. 
- static EMOJI_PRESENTATION_INDEX: [u8; 128] = [ - 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x21, 0x43, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0xA9, 0xFF, 0xFF, - ]; - - /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) - /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; 11] = [ + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; 6] = [ [ 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, + 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, + 0x0F, 0x07, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, - ], 
- [ - 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, - 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, - 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, - 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, + 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, + 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, + 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], [ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ], - [ - 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, - 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, 0xA8, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, + 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, + 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, + 0xA8, 0x00, ], [ 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, - 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, - ], - [ - 0x00, 
0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, - 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, + 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, ], ]; } From d944bdd3e7f1b1973a4442c0a1c4559b93905c4a Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 13:48:07 -0500 Subject: [PATCH 09/13] Spuriously treat certain always-wide characters as eligible for emoji presentation --- scripts/unicode.py | 41 ++++++++++++++++++++++++----------------- src/tables.rs | 38 +++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index fafe1a8..00b1aea 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -434,17 +434,17 @@ def make_variation_sequence_table( ) -> "tuple[list[int], list[list[int]]]": """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. (Characters that are always wide may be excluded.) - The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.""" + The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. + """ - prefixes_dict = defaultdict(list) + prefixes_dict = defaultdict(set) for cp in seqs: - prefixes_dict[cp >> 10].append(cp & 0x3FF) + prefixes_dict[cp >> 10].add(cp & 0x3FF) # We don't strictly need to keep track of characters that are always wide, # because being in an emoji variation seq won't affect their width. # So store their info only when it wouldn't inflate the size of the tables. - keys = list(prefixes_dict.keys()) - for k in keys: + for k in list(prefixes_dict.keys()): if all( map( lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, @@ -453,7 +453,14 @@ def make_variation_sequence_table( ): del prefixes_dict[k] - print(prefixes_dict) + indexes = list(prefixes_dict.keys()) + + # Similarly, we can spuriously return `true` for always-wide characters + # even if not part of a presentation seq; this saves an additional lookup, + # so we should do it where there is no size cost. + for cp, width in enumerate(width_map): + if width == EffectiveWidth.WIDE and (cp >> 10) in indexes: + prefixes_dict[cp >> 10].add(cp & 0x3FF) leaves = [] for cps in prefixes_dict.values(): @@ -462,7 +469,7 @@ def make_variation_sequence_table( idx_in_leaf, bit_shift = divmod(cp, 8) leaf[idx_in_leaf] |= 1 << bit_shift leaves.append(leaf) - return (list(prefixes_dict.keys()), leaves) + return (indexes, leaves) def emit_module( @@ -549,19 +556,19 @@ def emit_module( variation_idx, variation_leaves = variation_table module.write( - f""" + """ /// Whether this character forms an [emoji presentation sequence] /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) - /// when followed by `'\\u{{FEOF}}'`. + /// when followed by `'\\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. 
- /// This may spuriously return `false` for all characters that are always wide. + /// This may spuriously return `true` or `false` for characters that are always wide. #[inline] - pub fn starts_emoji_presentation_seq(c: char) -> bool {{ + pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); // First level of lookup uses all but 10 LSB let top_bits = cp >> 10; - let idx_of_leaf: usize = match top_bits {{ + let idx_of_leaf: usize = match top_bits { """ ) @@ -569,8 +576,8 @@ def emit_module( module.write(f" {msbs} => {i},\n") module.write( - f""" _ => return false, - }}; + """ _ => return false, + }; // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. @@ -579,7 +586,7 @@ def emit_module( // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 - }} + } """ ) @@ -701,10 +708,10 @@ def main(module_filename: str): print(f"Emoji presentation index size: {emoji_index_size} bytes") total_size += emoji_index_size emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) - print(f"Emoji presentation leaves Size: {emoji_leaves_size} bytes") + print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes") total_size += emoji_leaves_size print("------------------------") - print(f" Total Size: {total_size} bytes") + print(f" Total size: {total_size} bytes") emit_module(module_filename, version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/tables.rs b/src/tables.rs index db89a7d..7e1c899 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -61,7 +61,7 @@ pub mod charwidth { /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// when followed by `'\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. - /// This may spuriously return `false` for all characters that are always wide. + /// This may spuriously return `true` or `false` for characters that are always wide. 
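
An illustration of the contract documented above, not part of the patch: `'\u{FE0F}'` is zero-width on its own, but the two-character string "#\u{FE0F}" forms an emoji presentation sequence of width 2, so a string's width is not simply the sum of its characters' widths. The values below are the same ones `test_emoji_presentation` asserts earlier in this series.

use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

fn main() {
    // '#' alone is narrow, and the variation selector alone is zero-width...
    assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1));
    assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0));
    // ...but together they form an emoji presentation sequence of width 2.
    assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2);
}
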
#[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { let cp: u32 = c.into(); @@ -592,7 +592,7 @@ pub mod charwidth { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x0C, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, 0x0F, 0x07, ], @@ -622,27 +622,27 @@ pub mod charwidth { ], [ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x80, - 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, - 0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, - 0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20, - 0xA8, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x40, 0xFE, 0x07, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0F, 0xFF, 0x01, 0x03, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xCF, 0xCE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xB9, 0xFF, ], [ - 0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04, - 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84, - 0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, - 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10, - 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x20, 0x12, 0x01, - 0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xBF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0x00, 0x7E, + 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x07, 0x80, 0x3C, 0x61, 0x00, 0x30, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0xF8, 0xE7, 0xF0, 0x3F, 0x1A, 0xF9, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0xFF, 0x0F, + 0x01, 0x00, ], ]; } From a8b2fabb8e3ccc14f0abadff0c447a09c78a3ff9 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Tue, 5 Mar 2024 15:31:40 -0500 Subject: [PATCH 10/13] Align `EMOJI_PRESENTATION_LEAVES` to 128 bytes Ensure rows don't cross cache lines, makes a small difference in the benchmarks --- scripts/unicode.py | 9 ++++++--- src/tables.rs | 9 ++++++--- src/tests.rs | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 00b1aea..ed17229 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -582,7 +582,7 @@ def emit_module( // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); - let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -643,9 +643,12 @@ def emit_module( module.write( f""" + #[repr(align(128))] + struct Align128(T); + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. - static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; {len(variation_leaves)}] = [ + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([ """ ) for leaf in variation_leaves: @@ -657,7 +660,7 @@ def emit_module( module.write("\n") module.write(" ],\n") - module.write(" ];\n") + module.write(" ]);\n") module.write("}\n") diff --git a/src/tables.rs b/src/tables.rs index 7e1c899..fa76684 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -81,7 +81,7 @@ pub mod charwidth { // Extract the 3-9th (0-indexed) least significant bits of `cp`, // and use them to index into `leaf_row`. let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); - let leaf_byte = EMOJI_PRESENTATION_LEAVES[idx_of_leaf][idx_within_leaf]; + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; // Use the 3 LSB of `cp` to index into `leaf_byte`. ((leaf_byte >> (cp & 7)) & 1) == 1 @@ -569,9 +569,12 @@ pub mod charwidth { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; + #[repr(align(128))] + struct Align128(T); + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. 
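
Why the `Align128` wrapper above helps, as an illustration rather than part of the patch: a 128-byte-aligned array whose rows are exactly 128 bytes keeps every row on a 128-byte boundary of its own, so no row crosses into an extra cache line, which is the effect the commit message describes. A small sketch of that property; `LEAVES` here is a made-up stand-in for the generated table.

// Same shape of wrapper as the one added above, applied to a dummy table.
#[repr(align(128))]
struct Align128<T>(T);

// Hypothetical two-row table standing in for EMOJI_PRESENTATION_LEAVES.
static LEAVES: Align128<[[u8; 128]; 2]> = Align128([[0u8; 128]; 2]);

fn main() {
    // The wrapper forces 128-byte alignment of the whole array...
    assert_eq!(core::mem::align_of::<Align128<[[u8; 128]; 2]>>(), 128);
    // ...and because each row is exactly 128 bytes, every row starts on a
    // 128-byte boundary of its own.
    for row in LEAVES.0.iter() {
        assert_eq!(row.as_ptr() as usize % 128, 0);
    }
}
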
- static EMOJI_PRESENTATION_LEAVES: [[u8; 128]; 6] = [ + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; 6]> = Align128([ [ 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -644,5 +647,5 @@ pub mod charwidth { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x0F, 0x01, 0x00, ], - ]; + ]); } diff --git a/src/tests.rs b/src/tests.rs index 5b22a60..676068f 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -12,7 +12,7 @@ use std::{iter, string::String}; #[cfg(feature = "bench")] -use test::{self, Bencher}; +use test::Bencher; use super::{UnicodeWidthChar, UnicodeWidthStr}; From a5066aaffee62fb116101869d713844a053ae302 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 22 Apr 2024 15:34:31 -0400 Subject: [PATCH 11/13] Convert tests into integration tests --- .github/workflows/rust.yml | 2 +- Cargo.toml | 1 - benches/benches.rs | 113 ++++++++++++++++++++++++++++++++++ src/lib.rs | 11 ---- {src => tests}/tests.rs | 120 +------------------------------------ 5 files changed, 115 insertions(+), 132 deletions(-) create mode 100644 benches/benches.rs rename {src => tests}/tests.rs (65%) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7f2c9e9..c0908cb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -27,7 +27,7 @@ jobs: - name: Check formatting run: cargo fmt --check - name: Check clippy - run: cargo clippy + run: cargo clippy --lib --tests regen: runs-on: ubuntu-latest steps: diff --git a/Cargo.toml b/Cargo.toml index 7c44aa6..49e7539 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,6 @@ unicode-normalization = "0.1.23" [features] default = [] -bench = [] rustc-dep-of-std = ['std', 'core', 'compiler_builtins'] # Legacy, now a no-op diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000..c91cef4 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,113 @@ +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+#![feature(test)] + +extern crate test; + +use std::iter; + +use test::Bencher; + +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; + +#[bench] +fn cargo(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(UnicodeWidthChar::width(c)); + } + }); +} + +#[bench] +fn stdlib(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(c.width()); + } + }); +} + +#[bench] +fn simple_if(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_if(c)); + } + }); +} + +#[bench] +fn simple_match(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_match(c)); + } + }); +} + +#[inline] +fn simple_width_if(c: char) -> Option { + let cu = c as u32; + if cu < 127 { + if cu > 31 { + Some(1) + } else if cu == 0 { + Some(0) + } else { + None + } + } else { + UnicodeWidthChar::width(c) + } +} + +#[inline] +fn simple_width_match(c: char) -> Option { + match c as u32 { + cu if cu == 0 => Some(0), + cu if cu < 0x20 => None, + cu if cu < 0x7f => Some(1), + _ => UnicodeWidthChar::width(c), + } +} + +#[bench] +fn enwik8(b: &mut Bencher) { + // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip + let data_path = "bench_data/enwik8"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn jawiki(b: &mut Bencher) { + // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn emoji(b: &mut Bencher) { + // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt + let data_path = "bench_data/emoji-style.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} diff --git a/src/lib.rs b/src/lib.rs index 4e3813a..1ca6bb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,24 +79,13 @@ html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] -#![cfg_attr(feature = "bench", feature(test))] #![no_std] -#[cfg(test)] -#[macro_use] -extern crate std; - -#[cfg(feature = "bench")] -extern crate test; - use tables::charwidth as cw; pub use tables::UNICODE_VERSION; mod tables; -#[cfg(test)] -mod tests; - /// Methods for determining displayed width of Unicode characters. pub trait UnicodeWidthChar { /// Returns the character's displayed width in columns, or `None` if the diff --git a/src/tests.rs b/tests/tests.rs similarity index 65% rename from src/tests.rs rename to tests/tests.rs index 676068f..47218e4 100644 --- a/src/tests.rs +++ b/tests/tests.rs @@ -8,124 +8,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#[cfg(feature = "bench")] -use std::{iter, string::String}; - -#[cfg(feature = "bench")] -use test::Bencher; - -use super::{UnicodeWidthChar, UnicodeWidthStr}; - -#[cfg(feature = "bench")] -#[bench] -fn cargo(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(UnicodeWidthChar::width(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -#[allow(deprecated)] -fn stdlib(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(c.width()); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_if(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_if(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_match(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_match(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_if(c: char) -> Option { - let cu = c as u32; - if cu < 127 { - if cu > 31 { - Some(1) - } else if cu == 0 { - Some(0) - } else { - None - } - } else { - UnicodeWidthChar::width(c) - } -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_match(c: char) -> Option { - match c as u32 { - cu if cu == 0 => Some(0), - cu if cu < 0x20 => None, - cu if cu < 0x7f => Some(1), - _ => UnicodeWidthChar::width(c), - } -} -#[cfg(feature = "bench")] -#[bench] - -fn enwik8(b: &mut Bencher) { - // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip - let data_path = "bench_data/enwik8"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} -#[cfg(feature = "bench")] -#[bench] - -fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} - -#[cfg(feature = "bench")] -#[bench] - -fn emoji(b: &mut Bencher) { - // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt - let data_path = "bench_data/emoji-style.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; #[test] fn test_str() { - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("hello"), 10); assert_eq!("hello".width_cjk(), 10); assert_eq!(UnicodeWidthStr::width("\0\0\0\x01\x01"), 0); @@ -236,8 +122,6 @@ fn test_hieroglyph_format_controls() { #[test] fn test_marks() { - use super::UnicodeWidthChar; - // Nonspacing marks have 0 width assert_eq!(UnicodeWidthChar::width('\u{0301}'), Some(0)); // Enclosing marks have 0 width @@ -250,8 +134,6 @@ fn test_marks() { #[test] fn test_canonical_equivalence() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - for c in '\0'..='\u{10FFFF}' { let mut nfd = String::new(); unicode_normalization::char::decompose_canonical(c, |d| nfd.push(d)); From 
5e8bf9b882c2a52c77ca0d841d73f45370048982 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 22 Apr 2024 16:01:07 -0400 Subject: [PATCH 12/13] Update docs to mention `Grapheme_Extend` --- src/lib.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1ca6bb5..d952880 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,21 +54,28 @@ //! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. //! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. //! 4. The following have width 0: -//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AHangul_Syllable_Type%253D%252FV%7CT%252F%253A%5D) +//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) //! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) //! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`), -//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253ADefault_Ignorable_Code_Point%253DYes%253A%5D) +//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property, -//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253D%252FMn%7CMe%252F%253A%5D) -//! with a [`General_Category`](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142) -//! of `Nonspacing_Mark` (`Mn`) or `Enclosing_Mark` (`Me`), and -//! 4. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). -//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AGeneral_Category%253DCc%253A%5D) +//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) +//! with the [`Grapheme_Extend`](https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf#G52443) property, +//! 4. [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0), +//! [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7), +//! [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8), +//! [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA), +//! [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB), +//! [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B), +//! [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and +//! [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43), +//! 5. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) //! have no defined width, and are considered to have width 0 when contained within a string. -//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253D%252FF%7CW%252F%253A%5D) +//! 6. 
[Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DW%7D%5Cp%7BEast_Asian_Width%3DF%7D)
//! with an [`East_Asian_Width`](https://www.unicode.org/reports/tr11/#ED1) of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2)
//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2.
-//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AEast_Asian_Width%253DA%253A%5D)
+//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
//! with an `East_Asian_Width` of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6)
//! have width 2 in an East Asian context, and width 1 otherwise.
//! 8. All other characters have width 1.
From 46a60670d7307e5b0a6108c319a4d399c460b74a Mon Sep 17 00:00:00 2001
From: Jules Bertholet
Date: Mon, 22 Apr 2024 16:07:41 -0400
Subject: [PATCH 13/13] Update unicode.py comment to match new rules

---
 scripts/unicode.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/unicode.py b/scripts/unicode.py
index ed17229..b50d40f 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -674,7 +674,9 @@ def main(module_filename: str):
 - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c)
 - Hangul jamo medial vowels & final consonants are zero-width.
 - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER.
-    - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width.
+    - Control characters are zero-width.
+    - `Grapheme_Extend` characters, as well as eight spacing marks that canonically decompose to `Grapheme_Extend` characters,
+      are zero-width.
 - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width.
 - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
 - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width