Skip to content

Commit acaacf2

Browse files
Make characters with Line_Break=Ambiguous ambiguous
1 parent afab363 commit acaacf2

File tree

4 files changed

+39
-15
lines changed

4 files changed

+39
-15
lines changed

scripts/unicode.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,19 @@
1515
# - DerivedCoreProperties.txt
1616
# - EastAsianWidth.txt
1717
# - HangulSyllableType.txt
18+
# - LineBreak.txt
1819
# - NormalizationTest.txt (for tests only)
1920
# - PropList.txt
2021
# - ReadMe.txt
2122
# - Scripts.txt
2223
# - UnicodeData.txt
2324
# - emoji/emoji-data.txt
25+
# - emoji/emoji-test.txt (for tests only)
2426
# - emoji/emoji-variation-sequences.txt
27+
# - extracted/DerivedCombiningClass.txt
2528
# - extracted/DerivedGeneralCategory.txt
29+
# - extracted/DerivedJoiningGroup.txt
30+
# - extracted/DerivedJoiningType.txt
2631
#
2732
# Since this should not require frequent updates, we just store this
2833
# out-of-line and check the generated module into git.
@@ -429,6 +434,13 @@ def load_east_asian_widths() -> list[EastAsianWidth]:
429434
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
430435
width_map.append(EastAsianWidth.NARROW)
431436

437+
# Characters with Line_Break=AI (Ambiguous) are assigned East Asian Width Ambiguous
438+
load_property(
439+
"LineBreak.txt",
440+
"AI",
441+
lambda cp: (operator.setitem(width_map, cp, EastAsianWidth.AMBIGUOUS)),
442+
)
443+
432444
# Characters from alphabetic scripts are narrow
433445
load_property(
434446
"Scripts.txt",

src/lib.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,11 @@
117117
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
118118
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
119119
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
120-
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
121-
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
122-
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
120+
//! - Fulfills one of the following conditions:
121+
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
122+
//! - Has a [`Line_Break`] of [`AI`], or
123+
//! - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
124+
//! - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and
123125
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
124126
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
125127
//! 7. All other characters have width 1.
@@ -136,12 +138,15 @@
136138
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
137139
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
138140
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
141+
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
139142
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
140143
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
141144
//!
142145
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
143146
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
144147
//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
148+
//!
149+
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
145150
//!
146151
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
147152
//!

src/tables.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,8 +1030,8 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
10301030
],
10311031
#[cfg(feature = "cjk")]
10321032
[
1033-
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE,
1034-
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
1033+
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0x39, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
1034+
0xAE, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
10351035
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xB0, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10361036
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10371037
0x39, 0x39, 0x39, 0x39,
@@ -1884,7 +1884,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
18841884
#[cfg(feature = "cjk")]
18851885
[
18861886
0x95, 0x59, 0x59, 0x55, 0x95, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1887-
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
1887+
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x5A, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
18881888
0x5A, 0x55,
18891889
],
18901890
#[cfg(feature = "cjk")]
@@ -1920,13 +1920,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19201920
#[cfg(feature = "cjk")]
19211921
[
19221922
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
1923-
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA,
1924-
0xAA, 0xAA,
1925-
],
1926-
#[cfg(feature = "cjk")]
1927-
[
1928-
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
1929-
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55,
1923+
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56,
19301924
0x55, 0x55,
19311925
],
19321926
#[cfg(feature = "cjk")]
@@ -1937,7 +1931,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19371931
],
19381932
#[cfg(feature = "cjk")]
19391933
[
1940-
0x55, 0x69, 0x59, 0xA5, 0x55, 0x5F, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1934+
0x55, 0x69, 0x59, 0xA5, 0x55, 0xAF, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
19411935
0x55, 0x66, 0x55, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55, 0x9A, 0x9A, 0x6A, 0x9A, 0x55, 0x55,
19421936
0x55, 0xD5,
19431937
],
@@ -1954,6 +1948,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19541948
0xAA, 0xAA,
19551949
],
19561950
#[cfg(feature = "cjk")]
1951+
[
1952+
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xFD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55,
1953+
0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1954+
0x55, 0x55,
1955+
],
1956+
#[cfg(feature = "cjk")]
19571957
[
19581958
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
19591959
0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0xAD, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
@@ -1979,7 +1979,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19791979
],
19801980
#[cfg(feature = "cjk")]
19811981
[
1982-
0xAA, 0xAA, 0x6A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
1982+
0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
19831983
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA,
19841984
0xAA, 0xAA,
19851985
],

tests/tests.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,13 @@ fn emoji_test_file() {
574574
}
575575
}
576576

577+
#[test]
578+
fn ambiguous_line_break() {
579+
assert_width!("\u{24EA}", 1, 2);
580+
assert_width!("\u{2616}", 1, 2);
581+
assert_width!("\u{2780}", 1, 2);
582+
}
583+
577584
// Test traits are unsealed
578585

579586
#[cfg(feature = "cjk")]

0 commit comments

Comments
 (0)