diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c5f57..c0908cb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -7,7 +7,11 @@ on: branches: [ "master" ] env: + CARGO_INCREMENTAL: 0 CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + RUSTFLAGS: -D warnings + RUSTDOCFLAGS: -D warnings jobs: build: @@ -18,10 +22,19 @@ jobs: run: cargo build --verbose - name: Run tests run: cargo test --verbose + - name: Build docs + run: cargo doc + - name: Check formatting + run: cargo fmt --check + - name: Check clippy + run: cargo clippy --lib --tests regen: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Regen run: cd scripts && python3 unicode.py - name: Diff diff --git a/.gitignore b/.gitignore index 2d7d550..12e0bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ Cargo.lock scripts/tmp scripts/*.txt scripts/*.rs +bench_data/* diff --git a/Cargo.toml b/Cargo.toml index bd8da9c..49e7539 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,20 +2,23 @@ name = "unicode-width" version = "0.1.11" -authors = ["kwantam ", "Manish Goregaokar "] - +authors = [ + "kwantam ", + "Manish Goregaokar ", +] homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" documentation = "https://unicode-rs.github.io/unicode-width" license = "MIT/Apache-2.0" keywords = ["text", "width", "unicode"] readme = "README.md" +edition = "2021" description = """ Determine displayed width of `char` and `str` types according to Unicode Standard Annex #11 rules. 
""" -exclude = [ "target/*", "Cargo.lock" ] +exclude = ["target/*", "Cargo.lock"] [dependencies] std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } @@ -27,7 +30,6 @@ unicode-normalization = "0.1.23" [features] default = [] -bench = [] rustc-dep-of-std = ['std', 'core', 'compiler_builtins'] # Legacy, now a no-op diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000..c91cef4 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,113 @@ +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +#![feature(test)] + +extern crate test; + +use std::iter; + +use test::Bencher; + +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; + +#[bench] +fn cargo(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(UnicodeWidthChar::width(c)); + } + }); +} + +#[bench] +fn stdlib(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(c.width()); + } + }); +} + +#[bench] +fn simple_if(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_if(c)); + } + }); +} + +#[bench] +fn simple_match(b: &mut Bencher) { + let string = iter::repeat('a').take(4096).collect::(); + + b.iter(|| { + for c in string.chars() { + test::black_box(simple_width_match(c)); + } + }); +} + +#[inline] +fn simple_width_if(c: char) -> Option { + let cu = c as u32; + if cu < 127 { + if cu > 31 { + Some(1) + } else if cu == 0 { + Some(0) + } else { + None + } + } else { + UnicodeWidthChar::width(c) + 
} +} + +#[inline] +fn simple_width_match(c: char) -> Option { + match c as u32 { + cu if cu == 0 => Some(0), + cu if cu < 0x20 => None, + cu if cu < 0x7f => Some(1), + _ => UnicodeWidthChar::width(c), + } +} + +#[bench] +fn enwik8(b: &mut Bencher) { + // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip + let data_path = "bench_data/enwik8"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn jawiki(b: &mut Bencher) { + // To benchmark, download & extract `jawiki-20240201-pages-articles-multistream-index.txt` from + // https://dumps.wikimedia.org/jawiki/20240201/jawiki-20240201-pages-articles-multistream-index.txt.bz2 + let data_path = "bench_data/jawiki-20240201-pages-articles-multistream-index.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} + +#[bench] +fn emoji(b: &mut Bencher) { + // To benchmark, download emoji-style.txt from https://www.unicode.org/emoji/charts/emoji-style.txt + let data_path = "bench_data/emoji-style.txt"; + let string = std::fs::read_to_string(data_path).unwrap_or_default(); + b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); +} diff --git a/scripts/unicode.py b/scripts/unicode.py index e91f001..b50d40f 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -23,6 +23,8 @@ import os import re import sys +from collections import defaultdict +from itertools import batched NUM_CODEPOINTS = 0x110000 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" @@ -66,12 +68,13 @@ def fetch_open(filename: str): """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure. 
""" - if not os.path.exists(os.path.basename(filename)): + basename = os.path.basename(filename) + if not os.path.exists(basename): os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") try: - return open(filename, encoding="utf-8") + return open(basename, encoding="utf-8") except OSError: - sys.stderr.write(f"cannot load {filename}") + sys.stderr.write(f"cannot load {basename}") sys.exit(1) @@ -152,7 +155,8 @@ def load_zero_widths() -> "list[bool]": - it is in general category `Cc`, - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), - - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`), + - or if it is one of U+0CC0, U+0CC7, U+0CC8, U+0CCA, U+0CCB, U+1B3B, U+1B3D, or U+1B43, + - or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`) and is not U+115F, - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`). """ @@ -408,8 +412,71 @@ def make_tables( return tables +def load_variation_sequences() -> "list[int]": + """Outputs a list of codepoints, corresponding to all the valid characters for starting + an emoji presentation sequence.""" + + with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: + # Match all emoji presentation sequences + # (one codepoint followed by U+FE0F, and labeled "emoji style") + sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") + codepoints = [] + for line in sequences.readlines(): + if match := sequence.match(line): + cp = int(match.group(1), 16) + codepoints.append(cp) + return codepoints + + +def make_variation_sequence_table( + seqs: "list[int]", + width_map, +) -> "tuple[list[int], list[list[int]]]": + """Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence. + (Characters that are always wide may be excluded.) 
+ The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. + """ + + prefixes_dict = defaultdict(set) + for cp in seqs: + prefixes_dict[cp >> 10].add(cp & 0x3FF) + + # We don't strictly need to keep track of characters that are always wide, + # because being in an emoji variation seq won't affect their width. + # So store their info only when it wouldn't inflate the size of the tables. + for k in list(prefixes_dict.keys()): + if all( + map( + lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE, + prefixes_dict[k], + ) + ): + del prefixes_dict[k] + + indexes = list(prefixes_dict.keys()) + + # Similarly, we can spuriously return `true` for always-wide characters + # even if not part of a presentation seq; this saves an additional lookup, + # so we should do it where there is no size cost. + for cp, width in enumerate(width_map): + if width == EffectiveWidth.WIDE and (cp >> 10) in indexes: + prefixes_dict[cp >> 10].add(cp & 0x3FF) + + leaves = [] + for cps in prefixes_dict.values(): + leaf = [0] * 128 + for cp in cps: + idx_in_leaf, bit_shift = divmod(cp, 8) + leaf[idx_in_leaf] |= 1 << bit_shift + leaves.append(leaf) + return (indexes, leaves) + + def emit_module( - out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" + out_name: str, + unicode_version: "tuple[int, int, int]", + tables: "list[Table]", + variation_table: "tuple[list[int], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. @@ -486,6 +553,43 @@ def emit_module( """ ) + variation_idx, variation_leaves = variation_table + + module.write( + """ + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\\u{FE0F}'`. + /// Emoji presentation sequences are considered to have width 2. 
+ /// This may spuriously return `true` or `false` for characters that are always wide. + #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { +""" + ) + + for i, msbs in enumerate(variation_idx): + module.write(f" {msbs} => {i},\n") + + module.write( + """ _ => return false, + }; + + // Extract the 3-9th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; + + // Use the 3 LSB of `cp` to index into `leaf_byte`. + ((leaf_byte >> (cp & 7)) & 1) == 1 + } +""" + ) + module.write( """ /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -534,6 +638,29 @@ def emit_module( module.write(f" 0x{byte:02X},") module.write("\n ];\n") subtable_count = new_subtable_count + + # emoji table + + module.write( + f""" + #[repr(align(128))] + struct Align128(T); + + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([ +""" + ) + for leaf in variation_leaves: + module.write(" [\n") + for row in batched(leaf, 14): + module.write(" ") + for entry in row: + module.write(f" 0x{entry:02X},") + module.write("\n") + module.write(" ],\n") + + module.write(" ]);\n") module.write("}\n") @@ -543,10 +670,13 @@ def main(module_filename: str): `module_filename`. We obey the following rules in decreasing order of importance: + - Emoji presentation sequences are double-width. - The soft hyphen (`U+00AD`) is single-width. 
(https://archive.is/fCT3c) - Hangul jamo medial vowels & final consonants are zero-width. - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. - - All codepoints in general categories `Cc`, `Mn`, or `Me` are zero-width. + - Control characters are zero-width. + - `Grapheme_Extend` characters, as well as eight spacing marks that canonically decompose to `Grapheme_Extend` characters, + are zero-width. - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width @@ -570,16 +700,25 @@ def main(module_filename: str): tables = make_tables(TABLE_CFGS, enumerate(width_map)) + emoji_variations = load_variation_sequences() + variation_table = make_variation_sequence_table(emoji_variations, width_map) + print("------------------------") total_size = 0 for i, table in enumerate(tables): size_bytes = len(table.to_bytes()) - print(f"Table {i} Size: {size_bytes} bytes") + print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes + emoji_index_size = len(variation_table[0]) * 4 + print(f"Emoji presentation index size: {emoji_index_size} bytes") + total_size += emoji_index_size + emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) + print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes") + total_size += emoji_leaves_size print("------------------------") - print(f" Total Size: {total_size} bytes") + print(f" Total size: {total_size} bytes") - emit_module(module_filename, version, tables) + emit_module(module_filename, version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index 2f22613..d952880 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,11 @@ // except according to those terms. //! 
Determine displayed width of `char` and `str` types according to -//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) -//! rules. +//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/), +//! other portions of the Unicode standard, and common implementations of +//! POSIX [`wcwidth()`](https://pubs.opengroup.org/onlinepubs/9699919799/). +//! See the [Rules for determining width](#rules-for-determining-width) section +//! for the exact rules. //! //! ```rust //! extern crate unicode_width; @@ -41,30 +44,55 @@ //! [dependencies] //! unicode-width = "0.1.5" //! ``` +//! # Rules for determining width +//! +//! This crate currently uses the following rules to determine the width of a +//! character or string, in order of decreasing precedence. These may be tweaked in the future. +//! +//! 1. [Emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence) +//! have width 2. (The width of a string may therefore differ from the sum of the widths of its characters.) +//! 2. [`'\u{00AD}'` SOFT HYPHEN](https://util.unicode.org/UnicodeJsps/character.jsp?a=00AD) has width 1. +//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. +//! 4. The following have width 0: +//! 1. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D) +//! with a [`Hangul_Syllable_Type`](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593) +//! of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`), +//! 2. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) +//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property, +//! 3. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) +//! 
with the [`Grapheme_Extend`](https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf#G52443) property, +//! 4. [`'\u{0CC0}'` KANNADA VOWEL SIGN II](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC0), +//! [`'\u{0CC7}'` KANNADA VOWEL SIGN EE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC7), +//! [`'\u{0CC8}'` KANNADA VOWEL SIGN AI](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CC8), +//! [`'\u{0CCA}'` KANNADA VOWEL SIGN O](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCA), +//! [`'\u{0CCB}'` KANNADA VOWEL SIGN OO](https://util.unicode.org/UnicodeJsps/character.jsp?a=0CCB), +//! [`'\u{1B3B}'` BALINESE VOWEL SIGN RA REPA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3B), +//! [`'\u{1B3D}'` BALINESE VOWEL SIGN LA LENGA TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B3D), and +//! [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43), +//! 5. [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000). +//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D) +//! have no defined width, and are considered to have width 0 when contained within a string. +//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DW%7D%5Cp%7BEast_Asian_Width%3DF%7D) +//! with an [`East_Asian_Width`](https://www.unicode.org/reports/tr11/#ED1) of [`Fullwidth` (`F`)](https://www.unicode.org/reports/tr11/#ED2) +//! or [`Wide` (`W`)](https://www.unicode.org/reports/tr11/#ED4) have width 2. +//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D) +//! with an `East_Asian_Width` of [`Ambiguous` (`A`)](https://www.unicode.org/reports/tr11/#ED6) +//! have width 2 in an East Asian context, and width 1 otherwise. +//! 8. All other characters have width 1. 
-#![deny(missing_docs, unsafe_code)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] -#![cfg_attr(feature = "bench", feature(test))] #![no_std] -#[cfg(test)] -#[macro_use] -extern crate std; - -#[cfg(feature = "bench")] -extern crate test; - use tables::charwidth as cw; pub use tables::UNICODE_VERSION; mod tables; -#[cfg(test)] -mod tests; - /// Methods for determining displayed width of Unicode characters. pub trait UnicodeWidthChar { /// Returns the character's displayed width in columns, or `None` if the @@ -108,6 +136,10 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 1 column wide. This is consistent with the recommendations for /// non-CJK contexts, or when the context cannot be reliably determined. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. fn width(&self) -> usize; /// Returns the string's displayed width in columns. @@ -118,17 +150,38 @@ pub trait UnicodeWidthStr { /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) /// as 2 column wide. This is consistent with the recommendations for /// CJK contexts. + /// + /// Also consistent with UAX11, this function treats [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// as 2 columns wide. This means that the width of a string may not equal + /// the sum of the widths of its individual characters. 
fn width_cjk(&self) -> usize; } impl UnicodeWidthStr for str { #[inline] fn width(&self) -> usize { - self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum() + str_width(self, false) } #[inline] fn width_cjk(&self) -> usize { - self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum() + str_width(self, true) } } + +fn str_width(s: &str, is_cjk: bool) -> usize { + s.chars() + .rfold((0, false), |(sum, was_fe0f), c| { + if c == '\u{FE0F}' { + (sum, true) + } else { + let add = if was_fe0f && cw::starts_emoji_presentation_seq(c) { + 2 + } else { + cw::width(c, is_cjk).unwrap_or(0) + }; + (sum + add, false) + } + }) + .0 +} diff --git a/src/tables.rs b/src/tables.rs index 8e2e9eb..fa76684 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -57,6 +57,36 @@ pub mod charwidth { } } + /// Whether this character forms an [emoji presentation sequence] + /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) + /// when followed by `'\u{FE0F}'`. + /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `true` or `false` for characters that are always wide. + #[inline] + pub fn starts_emoji_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { + 0 => 0, + 8 => 1, + 9 => 2, + 10 => 3, + 124 => 4, + 125 => 5, + _ => return false, + }; + + // Extract the 3-9th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; + + // Use the 3 LSB of `cp` to index into `leaf_byte`. + ((leaf_byte >> (cp & 7)) & 1) == 1 + } + /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or /// `None` if `c` is a control character other than `'\x00'`. 
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, @@ -538,4 +568,84 @@ pub mod charwidth { 0x55, 0xAA, 0xAA, 0x56, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, ]; + + #[repr(align(128))] + struct Align128(T); + + /// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; 6]> = Align128([ + [ + 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x0C, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE, + 0x0F, 0x07, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x78, 0x1F, 0x40, 0x32, 0x21, 0x4D, 0xC4, + 0x00, 0x07, 0x05, 0xFF, 0x0F, 0x80, 0x69, 0x01, 0x00, 0xC8, 0x00, 0x00, 0xFC, 0x1A, + 0x83, 0x0C, 0x03, 0x60, 0x30, 0xC1, 0x1A, 0x00, 0x00, 0x06, 0xBF, 0x27, 0x24, 0xBF, + 0x54, 0x20, 0x02, 0x01, 0x18, 0x00, 0x90, 0x50, 0xB8, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xE0, 0x00, 0x02, 0x00, 0x01, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + ], + [ + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x40, 0xFE, 0x07, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0F, 0xFF, 0x01, 0x03, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xCF, 0xCE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xB9, 0xFF, + ], + [ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xBF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0x00, 0x7E, + 0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x07, 0x80, 0x3C, 0x61, 0x00, 0x30, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0xF8, 0xE7, 0xF0, 0x3F, 0x1A, 0xF9, 0x1F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x0F, + 0x01, 0x00, + ], + ]); } diff --git a/src/tests.rs b/tests/tests.rs similarity index 62% rename from src/tests.rs rename to tests/tests.rs index 9e3805b..47218e4 100644 --- a/src/tests.rs +++ b/tests/tests.rs @@ -8,112 +8,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-#[cfg(feature = "bench")] -use super::{UnicodeWidthChar, UnicodeWidthStr}; -#[cfg(feature = "bench")] -use std::iter; -#[cfg(feature = "bench")] -use test::Bencher; +use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; -use std::prelude::v1::*; - -#[cfg(feature = "bench")] -#[bench] -fn cargo(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(UnicodeWidthChar::width(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -#[allow(deprecated)] -fn stdlib(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(c.width()); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_if(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_if(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[bench] -fn simple_match(b: &mut Bencher) { - let string = iter::repeat('a').take(4096).collect::(); - - b.iter(|| { - for c in string.chars() { - test::black_box(simple_width_match(c)); - } - }); -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_if(c: char) -> Option { - let cu = c as u32; - if cu < 127 { - if cu > 31 { - Some(1) - } else if cu == 0 { - Some(0) - } else { - None - } - } else { - UnicodeWidthChar::width(c) - } -} - -#[cfg(feature = "bench")] -#[inline] -fn simple_width_match(c: char) -> Option { - match c as u32 { - cu if cu == 0 => Some(0), - cu if cu < 0x20 => None, - cu if cu < 0x7f => Some(1), - _ => UnicodeWidthChar::width(c), - } -} -#[cfg(feature = "bench")] -#[bench] -fn enwik8(b: &mut Bencher) { - // To benchmark, download & unzip `enwik8` from https://data.deepai.org/enwik8.zip - let data_path = "bench_data/enwik8"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} -#[cfg(feature = 
"bench")] -#[bench] -fn jawiki(b: &mut Bencher) { - // To benchmark, download & extract `jawiki-20220501-pages-articles-multistream-index.txt` from - // https://dumps.wikimedia.org/jawiki/20220501/jawiki-20220501-pages-articles-multistream-index.txt.bz2 - let data_path = "bench_data/jawiki-20220501-pages-articles-multistream-index.txt"; - let string = std::fs::read_to_string(data_path).unwrap_or_default(); - b.iter(|| test::black_box(UnicodeWidthStr::width(string.as_str()))); -} #[test] fn test_str() { - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("hello"), 10); assert_eq!("hello".width_cjk(), 10); assert_eq!(UnicodeWidthStr::width("\0\0\0\x01\x01"), 0); @@ -130,8 +28,6 @@ fn test_str() { #[test] fn test_emoji() { // Example from the README. - use super::UnicodeWidthStr; - assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist @@ -139,8 +35,6 @@ fn test_emoji() { #[test] fn test_char() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('h'), Some(2)); assert_eq!('h'.width_cjk(), Some(2)); assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); @@ -153,8 +47,6 @@ fn test_char() { #[test] fn test_char2() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\x00'), Some(0)); assert_eq!('\x00'.width_cjk(), Some(0)); @@ -182,15 +74,11 @@ fn test_char2() { #[test] fn unicode_12() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1F971}'), Some(2)); } #[test] fn test_default_ignorable() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{E0000}'), Some(0)); assert_eq!(UnicodeWidthChar::width('\u{1160}'), Some(0)); @@ -200,8 +88,6 @@ fn test_default_ignorable() { #[test] fn test_jamo() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{1100}'), Some(2)); assert_eq!(UnicodeWidthChar::width('\u{A97C}'), Some(2)); // Special case: 
U+115F HANGUL CHOSEONG FILLER @@ -214,8 +100,6 @@ fn test_jamo() { #[test] fn test_prepended_concatenation_marks() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{0600}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{070F}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{08E2}'), Some(1)); @@ -224,8 +108,6 @@ fn test_prepended_concatenation_marks() { #[test] fn test_interlinear_annotation_chars() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{FFF9}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFA}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{FFFB}'), Some(1)); @@ -233,8 +115,6 @@ fn test_interlinear_annotation_chars() { #[test] fn test_hieroglyph_format_controls() { - use super::UnicodeWidthChar; - assert_eq!(UnicodeWidthChar::width('\u{13430}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{13436}'), Some(1)); assert_eq!(UnicodeWidthChar::width('\u{1343C}'), Some(1)); @@ -242,8 +122,6 @@ fn test_hieroglyph_format_controls() { #[test] fn test_marks() { - use super::UnicodeWidthChar; - // Nonspacing marks have 0 width assert_eq!(UnicodeWidthChar::width('\u{0301}'), Some(0)); // Enclosing marks have 0 width @@ -256,8 +134,6 @@ fn test_marks() { #[test] fn test_canonical_equivalence() { - use super::{UnicodeWidthChar, UnicodeWidthStr}; - for c in '\0'..='\u{10FFFF}' { let mut nfd = String::new(); unicode_normalization::char::decompose_canonical(c, |d| nfd.push(d)); @@ -272,3 +148,20 @@ fn test_canonical_equivalence() { //assert_eq!(c.width_cjk().unwrap_or(0), nfd.width_cjk(), "{c}, {nfd}"); } } + +#[test] +fn test_emoji_presentation() { + assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1)); + assert_eq!(UnicodeWidthChar::width('\u{FE0F}'), Some(0)); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); + assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); + 
assert_eq!(UnicodeWidthStr::width("\u{0023}\u{0023}\u{FE0F}a"), 4); + + assert_eq!(UnicodeWidthStr::width("\u{002A}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{23F9}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{24C2}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F6F3}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F700}\u{FE0F}"), 1); +}