Skip to content

Commit 4d58f18

Browse files
committed
Update grapheme segmentation to Unicode 11
1 parent 504ba99 commit 4d58f18

File tree

2 files changed

+14
-11
lines changed

2 files changed

+14
-11
lines changed

src/grapheme.rs

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -147,8 +147,8 @@ enum GraphemeState {
147147
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
148148
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
149149
Regional,
150-
// The codepoint after is in the E_Modifier category, so whether it's a boundary
151-
// depends on pre-context according to GB10.
150+
// The codepoint after is Extended_Pictographic,
151+
// so whether it's a boundary depends on pre-context according to GB11.
152152
Emoji,
153153
}
154154

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
239239
(_, GC_ZWJ) => NotBreak, // GB9
240240
(_, GC_SpacingMark) => Extended, // GB9a
241241
(GC_Prepend, _) => Extended, // GB9b
242-
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
243-
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
244-
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
245-
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
246-
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
242+
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
247243
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
248244
(_, _) => Break, // GB999
249245
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {
415411

416412
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
417413
use tables::grapheme as gr;
418-
for ch in chunk.chars().rev() {
414+
let mut iter = chunk.chars().rev();
415+
if let Some(ch) = iter.next() {
416+
if gr::grapheme_category(ch) != gr::GC_ZWJ {
417+
self.decide(true);
418+
return;
419+
}
420+
}
421+
for ch in iter {
419422
match gr::grapheme_category(ch) {
420423
gr::GC_Extend => (),
421-
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
424+
gr::GC_Extended_Pictographic => {
422425
self.decide(false);
423426
return;
424427
}
@@ -484,7 +487,7 @@ impl GraphemeCursor {
484487
let mut need_pre_context = true;
485488
match self.cat_after.unwrap() {
486489
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
487-
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
490+
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
488491
_ => need_pre_context = self.cat_before.is_none(),
489492
}
490493
if need_pre_context {

src/test.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ fn test_graphemes() {
3737
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
3838
// (test case from issue #19)
3939
("\u{1F938}\u{1F3FE}\u{1F3FE}",
40-
&["\u{1F938}\u{1F3FE}", "\u{1F3FE}"]),
40+
&["\u{1F938}\u{1F3FE}\u{1F3FE}"]),
4141
];
4242

4343
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {

0 commit comments

Comments
 (0)