Skip to content

Commit 5568777

Browse files
committed
Implement rule GB10 (Emoji modifier sequences)
1 parent 74ea683 commit 5568777

File tree

1 file changed

+51
-3
lines changed

1 file changed

+51
-3
lines changed

src/grapheme.rs

Lines changed: 51 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,14 +51,15 @@ pub struct Graphemes<'a> {
5151
}
5252

5353
// state machine for cluster boundary rules
54-
#[derive(PartialEq,Eq)]
54+
#[derive(Copy,Clone,PartialEq,Eq)]
5555
enum GraphemeState {
5656
Start,
5757
FindExtend,
5858
HangulL,
5959
HangulLV,
6060
HangulLVT,
6161
Regional,
62+
Emoji,
6263
Zwj,
6364
}
6465

@@ -94,6 +95,10 @@ impl<'a> Iterator for Graphemes<'a> {
9495
_ => self.cat.take().unwrap()
9596
};
9697

98+
if (state, cat) == (Emoji, gr::GC_Extend) {
99+
continue; // rule GB10
100+
}
101+
97102
if let Some(new_state) = match cat {
98103
gr::GC_Extend => Some(FindExtend), // rule GB9
99104
gr::GC_SpacingMark if self.extended => Some(FindExtend), // rule GB9a
@@ -119,6 +124,7 @@ impl<'a> Iterator for Graphemes<'a> {
119124
gr::GC_LV | gr::GC_V => HangulLV,
120125
gr::GC_LVT | gr::GC_T => HangulLVT,
121126
gr::GC_Regional_Indicator => Regional,
127+
gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji,
122128
_ => FindExtend
123129
},
124130
FindExtend => { // found non-extending when looking for extending
@@ -156,8 +162,16 @@ impl<'a> Iterator for Graphemes<'a> {
156162
break;
157163
}
158164
},
165+
Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
166+
gr::GC_E_Modifier => continue,
167+
_ => {
168+
take_curr = false;
169+
break;
170+
}
171+
},
159172
Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
160-
gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => continue,
173+
gr::GC_Glue_After_Zwj => continue,
174+
gr::GC_E_Base_GAZ => Emoji,
161175
_ => {
162176
take_curr = false;
163177
break;
@@ -193,7 +207,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
193207
let mut previdx = idx;
194208
let mut state = Start;
195209
let mut cat = gr::GC_Any;
196-
for (curr, ch) in self.string.char_indices().rev() {
210+
211+
'outer: for (curr, ch) in self.string.char_indices().rev() {
197212
previdx = idx;
198213
idx = curr;
199214

@@ -225,6 +240,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
225240
gr::GC_Extend => FindExtend,
226241
gr::GC_SpacingMark if self.extended => FindExtend,
227242
gr::GC_ZWJ => FindExtend,
243+
gr::GC_E_Modifier => Emoji,
228244
gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => Zwj,
229245
gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
230246
gr::GC_V => HangulLV,
@@ -267,6 +283,38 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
267283
break;
268284
}
269285
},
286+
Emoji => { // char to right is E_Modifier
287+
// In order to decide whether to break before this E_Modifier char, we need to
288+
// scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
289+
let mut ebg_idx = None;
290+
for (startidx, prev) in self.string[..previdx].char_indices().rev() {
291+
match (ebg_idx, gr::grapheme_category(prev)) {
292+
(None, gr::GC_Extend) => continue,
293+
(None, gr::GC_E_Base) => { // rule GB10
294+
// Found an Emoji modifier sequence. Return the whole sequence.
295+
idx = startidx;
296+
break 'outer;
297+
}
298+
(None, gr::GC_E_Base_GAZ) => { // rule GB10
299+
// Keep scanning in case this is part of a ZWJ x EBG pair.
300+
ebg_idx = Some(startidx);
301+
}
302+
(Some(_), gr::GC_ZWJ) => { // rule GB11
303+
idx = startidx;
304+
break 'outer;
305+
}
306+
_ => break
307+
}
308+
}
309+
if let Some(ebg_idx) = ebg_idx {
310+
// Found an EBG without a ZWJ before it.
311+
idx = ebg_idx;
312+
break;
313+
}
314+
// Not part of an Emoji modifier sequence. Break here.
315+
take_curr = false;
316+
break;
317+
},
270318
Zwj => match cat { // char to right is (GAZ|EBG)
271319
gr::GC_ZWJ => continue, // rule GB11: ZWJ x (GAZ|EBG)
272320
_ => {

0 commit comments

Comments
 (0)