@@ -48,17 +48,21 @@ pub struct Graphemes<'a> {
48
48
extended : bool ,
49
49
cat : Option < GraphemeCat > ,
50
50
catb : Option < GraphemeCat > ,
51
+ regional_count_back : Option < usize > ,
51
52
}
52
53
53
54
// state machine for cluster boundary rules
//
// The same enum is used by both next() (forward scan) and next_back()
// (reverse scan); several variants mean different things per direction:
//   Start      - initial state of every scan in either direction.
//   FindExtend - forward: set after Extend, or SpacingMark in extended mode
//                (rules GB9/GB9a), and as the fallback category state.
//   HangulL / HangulLV / HangulLVT - Hangul syllable states; the categories
//                that select each one differ between next() and next_back().
//   Prepend    - forward only: set on GC_Prepend in extended mode (rule GB9b).
//                next_back() never enters it and handles Prepend in a
//                separate pass after its main loop.
//   Regional   - a Regional_Indicator was seen (rules GB12/GB13).
//   Emoji      - forward: set on E_Base / E_Base_GAZ, waiting for E_Modifier
//                (rule GB10). Reverse: the char to the right is E_Modifier.
//   Zwj        - forward: set on ZWJ (rule GB11). Reverse: the char to the
//                right is Glue_After_Zwj or E_Base_GAZ.
//
// Copy/Clone let `state` be copied into the `(state, cat)` tuple comparison
// in next() without moving it.
54
- #[ derive( PartialEq , Eq ) ]
55
+ #[ derive( Copy , Clone , PartialEq , Eq ) ]
55
56
enum GraphemeState {
56
57
Start ,
57
58
FindExtend ,
58
59
HangulL ,
59
60
HangulLV ,
60
61
HangulLVT ,
62
+ Prepend ,
61
63
Regional ,
64
+ Emoji ,
65
+ Zwj ,
62
66
}
63
67
64
68
impl < ' a > Iterator for Graphemes < ' a > {
@@ -82,6 +86,11 @@ impl<'a> Iterator for Graphemes<'a> {
82
86
let mut idx = 0 ;
83
87
let mut state = Start ;
84
88
let mut cat = gr:: GC_Any ;
89
+
90
+ // caching used by next_back() should be invalidated
91
+ self . regional_count_back = None ;
92
+ self . catb = None ;
93
+
85
94
for ( curr, ch) in self . string . char_indices ( ) {
86
95
idx = curr;
87
96
@@ -93,13 +102,18 @@ impl<'a> Iterator for Graphemes<'a> {
93
102
_ => self . cat . take ( ) . unwrap ( )
94
103
} ;
95
104
96
- if match cat {
97
- gr:: GC_Extend => true ,
98
- gr:: GC_SpacingMark if self . extended => true ,
99
- _ => false
105
+ if ( state, cat) == ( Emoji , gr:: GC_Extend ) {
106
+ continue ; // rule GB10
107
+ }
108
+
109
+ if let Some ( new_state) = match cat {
110
+ gr:: GC_Extend => Some ( FindExtend ) , // rule GB9
111
+ gr:: GC_SpacingMark if self . extended => Some ( FindExtend ) , // rule GB9a
112
+ gr:: GC_ZWJ => Some ( Zwj ) , // rule GB9/GB11
113
+ _ => None
100
114
} {
101
- state = FindExtend ; // rule GB9/GB9a
102
- continue ;
115
+ state = new_state ;
116
+ continue ;
103
117
}
104
118
105
119
state = match state {
@@ -116,7 +130,9 @@ impl<'a> Iterator for Graphemes<'a> {
116
130
gr:: GC_L => HangulL ,
117
131
gr:: GC_LV | gr:: GC_V => HangulLV ,
118
132
gr:: GC_LVT | gr:: GC_T => HangulLVT ,
133
+ gr:: GC_Prepend if self . extended => Prepend ,
119
134
gr:: GC_Regional_Indicator => Regional ,
135
+ gr:: GC_E_Base | gr:: GC_E_Base_GAZ => Emoji ,
120
136
_ => FindExtend
121
137
} ,
122
138
FindExtend => { // found non-extending when looking for extending
@@ -147,13 +163,35 @@ impl<'a> Iterator for Graphemes<'a> {
147
163
break ;
148
164
}
149
165
} ,
150
- Regional => match cat { // rule GB8a
151
- gr:: GC_Regional_Indicator => continue ,
166
+ Prepend => match cat { // rule GB9b
167
+ gr:: GC_Control => {
168
+ take_curr = false ;
169
+ break ;
170
+ }
171
+ _ => continue
172
+ } ,
173
+ Regional => match cat { // rule GB12/GB13
174
+ gr:: GC_Regional_Indicator => FindExtend ,
152
175
_ => {
153
176
take_curr = false ;
154
177
break ;
155
178
}
156
- }
179
+ } ,
180
+ Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
181
+ gr:: GC_E_Modifier => continue ,
182
+ _ => {
183
+ take_curr = false ;
184
+ break ;
185
+ }
186
+ } ,
187
+ Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
188
+ gr:: GC_Glue_After_Zwj => continue ,
189
+ gr:: GC_E_Base_GAZ => Emoji ,
190
+ _ => {
191
+ take_curr = false ;
192
+ break ;
193
+ }
194
+ } ,
157
195
}
158
196
}
159
197
@@ -184,7 +222,11 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
184
222
let mut previdx = idx;
185
223
let mut state = Start ;
186
224
let mut cat = gr:: GC_Any ;
187
- for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
225
+
226
+ // caching used by next() should be invalidated
227
+ self . cat = None ;
228
+
229
+ ' outer: for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
188
230
previdx = idx;
189
231
idx = curr;
190
232
@@ -215,6 +257,9 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
215
257
Start | FindExtend => match cat {
216
258
gr:: GC_Extend => FindExtend ,
217
259
gr:: GC_SpacingMark if self . extended => FindExtend ,
260
+ gr:: GC_ZWJ => FindExtend ,
261
+ gr:: GC_E_Modifier => Emoji ,
262
+ gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => Zwj ,
218
263
gr:: GC_L | gr:: GC_LV | gr:: GC_LVT => HangulL ,
219
264
gr:: GC_V => HangulLV ,
220
265
gr:: GC_T => HangulLVT ,
@@ -249,8 +294,62 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
249
294
break ;
250
295
}
251
296
} ,
252
- Regional => match cat { // rule GB8a
253
- gr:: GC_Regional_Indicator => continue ,
297
+ Prepend => {
298
+ // not used in reverse iteration
299
+ unreachable ! ( )
300
+ } ,
301
+ Regional => { // rule GB12/GB13
302
+ // Need to scan backward to find if this is preceded by an odd or even number
303
+ // of Regional_Indicator characters.
304
+ let count = match self . regional_count_back {
305
+ Some ( count) => count,
306
+ None => self . string [ ..previdx] . chars ( ) . rev ( ) . take_while ( |c| {
307
+ gr:: grapheme_category ( * c) == gr:: GC_Regional_Indicator
308
+ } ) . count ( )
309
+ } ;
310
+ // Cache the count to avoid re-scanning the same chars on the next iteration.
311
+ self . regional_count_back = count. checked_sub ( 1 ) ;
312
+
313
+ if count % 2 == 0 {
314
+ take_curr = false ;
315
+ break ;
316
+ }
317
+ continue ;
318
+ } ,
319
+ Emoji => { // char to right is E_Modifier
320
+ // In order to decide whether to break before this E_Modifier char, we need to
321
+ // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
322
+ let mut ebg_idx = None ;
323
+ for ( startidx, prev) in self . string [ ..previdx] . char_indices ( ) . rev ( ) {
324
+ match ( ebg_idx, gr:: grapheme_category ( prev) ) {
325
+ ( None , gr:: GC_Extend ) => continue ,
326
+ ( None , gr:: GC_E_Base ) => { // rule GB10
327
+ // Found an Emoji modifier sequence. Return the whole sequence.
328
+ idx = startidx;
329
+ break ' outer;
330
+ }
331
+ ( None , gr:: GC_E_Base_GAZ ) => { // rule GB10
332
+ // Keep scanning in case this is part of a ZWJ x EBG pair.
333
+ ebg_idx = Some ( startidx) ;
334
+ }
335
+ ( Some ( _) , gr:: GC_ZWJ ) => { // rule GB11
336
+ idx = startidx;
337
+ break ' outer;
338
+ }
339
+ _ => break
340
+ }
341
+ }
342
+ if let Some ( ebg_idx) = ebg_idx {
343
+ // Found an EBG without a ZWJ before it.
344
+ idx = ebg_idx;
345
+ break ;
346
+ }
347
+ // Not part of an Emoji modifier sequence. Break here.
348
+ take_curr = false ;
349
+ break ;
350
+ } ,
351
+ Zwj => match cat { // char to right is (GAZ|EBG)
352
+ gr:: GC_ZWJ => continue , // rule GB11: ZWJ x (GAZ|EBG)
254
353
_ => {
255
354
take_curr = false ;
256
355
break ;
@@ -266,6 +365,19 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
266
365
Some ( cat)
267
366
} ;
268
367
368
+ if self . extended && cat != gr:: GC_Control {
369
+ // rule GB9b: include any preceding Prepend characters
370
+ for ( i, c) in self . string [ ..idx] . char_indices ( ) . rev ( ) {
371
+ match gr:: grapheme_category ( c) {
372
+ gr:: GC_Prepend => idx = i,
373
+ cat => {
374
+ self . catb = Some ( cat) ;
375
+ break ;
376
+ }
377
+ }
378
+ }
379
+ }
380
+
269
381
let retstr = & self . string [ idx..] ;
270
382
self . string = & self . string [ ..idx] ;
271
383
Some ( retstr)
@@ -274,7 +386,13 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
274
386
275
387
/// Creates a `Graphemes` iterator over the string slice `s`.
///
/// `is_extended` is stored in the `extended` field, which the iterator
/// checks before applying the SpacingMark (GB9a) and Prepend (GB9b) rules.
///
/// All cached lookahead state starts empty: `cat` (used by `next()`), and
/// `catb` and `regional_count_back` (used by `next_back()`) are `None`.
#[ inline]
276
388
pub fn new_graphemes < ' b > ( s : & ' b str , is_extended : bool ) -> Graphemes < ' b > {
277
- Graphemes { string : s, extended : is_extended, cat : None , catb : None }
389
+ Graphemes {
390
+ string : s,
391
+ extended : is_extended,
392
+ cat : None ,
393
+ catb : None ,
394
+ regional_count_back : None
395
+ }
278
396
}
279
397
280
398
#[ inline]
0 commit comments