@@ -51,14 +51,15 @@ pub struct Graphemes<'a> {
51
51
}
52
52
53
53
// state machine for cluster boundary rules
54
- #[ derive( PartialEq , Eq ) ]
54
+ #[ derive( Copy , Clone , PartialEq , Eq ) ]
55
55
enum GraphemeState {
56
56
Start ,
57
57
FindExtend ,
58
58
HangulL ,
59
59
HangulLV ,
60
60
HangulLVT ,
61
61
Regional ,
62
+ Emoji ,
62
63
Zwj ,
63
64
}
64
65
@@ -94,6 +95,10 @@ impl<'a> Iterator for Graphemes<'a> {
94
95
_ => self . cat . take ( ) . unwrap ( )
95
96
} ;
96
97
98
+ if ( state, cat) == ( Emoji , gr:: GC_Extend ) {
99
+ continue ; // rule GB10
100
+ }
101
+
97
102
if let Some ( new_state) = match cat {
98
103
gr:: GC_Extend => Some ( FindExtend ) , // rule GB9
99
104
gr:: GC_SpacingMark if self . extended => Some ( FindExtend ) , // rule GB9a
@@ -119,6 +124,7 @@ impl<'a> Iterator for Graphemes<'a> {
119
124
gr:: GC_LV | gr:: GC_V => HangulLV ,
120
125
gr:: GC_LVT | gr:: GC_T => HangulLVT ,
121
126
gr:: GC_Regional_Indicator => Regional ,
127
+ gr:: GC_E_Base | gr:: GC_E_Base_GAZ => Emoji ,
122
128
_ => FindExtend
123
129
} ,
124
130
FindExtend => { // found non-extending when looking for extending
@@ -156,8 +162,16 @@ impl<'a> Iterator for Graphemes<'a> {
156
162
break ;
157
163
}
158
164
} ,
165
+ Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
166
+ gr:: GC_E_Modifier => continue ,
167
+ _ => {
168
+ take_curr = false ;
169
+ break ;
170
+ }
171
+ } ,
159
172
Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
160
- gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => continue ,
173
+ gr:: GC_Glue_After_Zwj => continue ,
174
+ gr:: GC_E_Base_GAZ => Emoji ,
161
175
_ => {
162
176
take_curr = false ;
163
177
break ;
@@ -193,7 +207,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
193
207
let mut previdx = idx;
194
208
let mut state = Start ;
195
209
let mut cat = gr:: GC_Any ;
196
- for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
210
+
211
+ ' outer: for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
197
212
previdx = idx;
198
213
idx = curr;
199
214
@@ -225,6 +240,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
225
240
gr:: GC_Extend => FindExtend ,
226
241
gr:: GC_SpacingMark if self . extended => FindExtend ,
227
242
gr:: GC_ZWJ => FindExtend ,
243
+ gr:: GC_E_Modifier => Emoji ,
228
244
gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => Zwj ,
229
245
gr:: GC_L | gr:: GC_LV | gr:: GC_LVT => HangulL ,
230
246
gr:: GC_V => HangulLV ,
@@ -267,6 +283,38 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
267
283
break ;
268
284
}
269
285
} ,
286
+ Emoji => { // char to right is E_Modifier
287
+ // In order to decide whether to break before this E_Modifier char, we need to
288
+ // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
289
+ let mut ebg_idx = None ;
290
+ for ( startidx, prev) in self . string [ ..previdx] . char_indices ( ) . rev ( ) {
291
+ match ( ebg_idx, gr:: grapheme_category ( prev) ) {
292
+ ( None , gr:: GC_Extend ) => continue ,
293
+ ( None , gr:: GC_E_Base ) => { // rule GB10
294
+ // Found an Emoji modifier sequence. Return the whole sequence.
295
+ idx = startidx;
296
+ break ' outer;
297
+ }
298
+ ( None , gr:: GC_E_Base_GAZ ) => { // rule GB10
299
+ // Keep scanning in case this is part of a ZWJ x EBG pair.
300
+ ebg_idx = Some ( startidx) ;
301
+ }
302
+ ( Some ( _) , gr:: GC_ZWJ ) => { // rule GB11
303
+ idx = startidx;
304
+ break ' outer;
305
+ }
306
+ _ => break
307
+ }
308
+ }
309
+ if let Some ( ebg_idx) = ebg_idx {
310
+ // Found an EBG without a ZWJ before it.
311
+ idx = ebg_idx;
312
+ break ;
313
+ }
314
+ // Not part of an Emoji modifier sequence. Break here.
315
+ take_curr = false ;
316
+ break ;
317
+ } ,
270
318
Zwj => match cat { // char to right is (GAZ|EBG)
271
319
gr:: GC_ZWJ => continue , // rule GB11: ZWJ x (GAZ|EBG)
272
320
_ => {
0 commit comments