From 4bd52ab7f895a311ba3cd2b090f7daff41b8b1a2 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 9 Jun 2024 08:29:28 -0400 Subject: [PATCH] Make characters with `Line_Break=Ambiguous` ambiguous --- scripts/unicode.py | 12 ++++++++++++ src/lib.rs | 11 ++++++++--- src/tables.rs | 24 ++++++++++++------------ tests/tests.rs | 7 +++++++ 4 files changed, 39 insertions(+), 15 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index aa0d86b..e7a0b71 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -15,14 +15,19 @@ # - DerivedCoreProperties.txt # - EastAsianWidth.txt # - HangulSyllableType.txt +# - LineBreak.txt # - NormalizationTest.txt (for tests only) # - PropList.txt # - ReadMe.txt # - UnicodeData.txt # - auxiliary/GraphemeBreakProperty.txt # - emoji/emoji-data.txt +# - emoji/emoji-test.txt (for tests only) # - emoji/emoji-variation-sequences.txt +# - extracted/DerivedCombiningClass.txt # - extracted/DerivedGeneralCategory.txt +# - extracted/DerivedJoiningGroup.txt +# - extracted/DerivedJoiningType.txt # # Since this should not require frequent updates, we just store this # out-of-line and check the generated module into git. @@ -429,6 +434,13 @@ def load_east_asian_widths() -> list[EastAsianWidth]: # Catch any leftover codepoints and assign them implicit Neutral/narrow width. width_map.append(EastAsianWidth.NARROW) + # Characters with ambiguous line breaking are ambiguous + load_property( + "LineBreak.txt", + "AI", + lambda cp: (operator.setitem(width_map, cp, EastAsianWidth.AMBIGUOUS)), + ) + # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow load_property( "extracted/DerivedGeneralCategory.txt", diff --git a/src/lib.rs b/src/lib.rs index 71b5d70..2b1c4d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -119,9 +119,11 @@ //! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. //! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise: -//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or -//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or -//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and +//! - Fulfills one of the following conditions: +//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or +//! - Has a [`Line_Break`] of [`AI`], or +//! - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or +//! - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and //! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`. //! 7. All other characters have width 1. //! @@ -138,6 +140,7 @@ //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 //! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862 //! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009 +//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5 //! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908 //! [`Script`]: https://www.unicode.org/reports/tr24/#Script //! @@ -145,6 +148,8 @@ //! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4 //! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6 //! +//! [`AI`]: https://www.unicode.org/reports/tr14/#AI +//! //! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602 //! //! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence diff --git a/src/tables.rs b/src/tables.rs index fa632d6..c9c017b 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1030,8 +1030,8 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([ ], #[cfg(feature = "cjk")] [ - 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, - 0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0x39, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, + 0xAD, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, @@ -1878,7 +1878,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ #[cfg(feature = "cjk")] [ 0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x5A, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0x5A, 0x55, ], #[cfg(feature = "cjk")] @@ -1914,13 +1914,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ #[cfg(feature = "cjk")] [ 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, - ], - #[cfg(feature = "cjk")] - [ - 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, + 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56, 0x55, 0x55, ], #[cfg(feature = "cjk")] @@ -1931,7 +1925,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], #[cfg(feature = "cjk")] [ - 0x55, 0x69, 0x59, 0xA5, 0x55, 0x5F, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x69, 0x59, 0xA5, 0x55, 0xAF, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x66, 0x55, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55, 0x9A, 0x9A, 0x6A, 0x9A, 0x55, 0x55, 0x55, 0xD5, ], @@ -1948,6 +1942,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0xAA, 0xAA, ], #[cfg(feature = "cjk")] + [ + 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xFD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55, + 0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, + ], + #[cfg(feature = "cjk")] [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0xAD, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, @@ -1973,7 +1973,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], #[cfg(feature = "cjk")] [ - 0xAA, 0xAA, 0x6A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA, + 0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, ], diff --git a/tests/tests.rs b/tests/tests.rs index 2940df2..7dc5b61 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -588,6 +588,13 @@ fn emoji_test_file() { } } +#[test] +fn ambiguous_line_break() { + assert_width!("\u{24EA}", 1, 2); + assert_width!("\u{2616}", 1, 2); + assert_width!("\u{2780}", 1, 2); +} + // Test traits are unsealed #[cfg(feature = "cjk")]