@@ -80,7 +80,7 @@ enum UWordBoundsState {
80
80
Numeric ,
81
81
Katakana ,
82
82
ExtendNumLet ,
83
- Regional ( /* half */ bool ) ,
83
+ Regional ( RegionalState ) ,
84
84
FormatExtend ( FormatExtendType ) ,
85
85
Zwj ( /* tainted */ bool ) ,
86
86
Emoji ,
@@ -97,6 +97,13 @@ enum FormatExtendType {
97
97
RequireNumeric ,
98
98
}
99
99
100
+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
101
+ enum RegionalState { // payload of the `Regional` scan state: progress through a run of Regional_Indicator characters (rule WB13c)
102
+ Half , // one regional indicator seen so far; a second one is still taken and moves the state to Full
103
+ Full , // a pair has been consumed; a further regional indicator stops the scan without taking the current character
104
+ Unknown , // set when scanning from the back; resolved to Half or Full by counting the regional indicators before this position (forward iteration treats it as unreachable)
105
+ }
106
+
100
107
impl < ' a > Iterator for UWordBounds < ' a > {
101
108
type Item = & ' a str ;
102
109
@@ -184,7 +191,7 @@ impl<'a> Iterator for UWordBounds<'a> {
184
191
wd:: WC_Numeric => Numeric , // rule WB8, WB10, WB12, WB13a
185
192
wd:: WC_Katakana => Katakana , // rule WB13, WB13a
186
193
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
187
- wd:: WC_Regional_Indicator => Regional ( /* half = */ true ) , // rule WB13c
194
+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
188
195
wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
189
196
wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
190
197
wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
@@ -269,20 +276,21 @@ impl<'a> Iterator for UWordBounds<'a> {
269
276
break ;
270
277
}
271
278
} ,
272
- Regional ( false ) => {
279
+ Regional ( RegionalState :: Full ) => {
273
280
// if it reaches here we've gone too far,
274
281
// a full flag can only compose with ZWJ/Extend/Format
275
282
// following it.
276
283
take_curr = false ;
277
284
break ;
278
285
}
279
- Regional ( /* half */ true ) => match cat {
280
- wd:: WC_Regional_Indicator => Regional ( false ) , // rule WB13c
286
+ Regional ( RegionalState :: Half ) => match cat {
287
+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Full ) , // rule WB13c
281
288
_ => {
282
289
take_curr = false ;
283
290
break ;
284
291
}
285
292
} ,
293
+ Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
286
294
Emoji => match cat { // rule WB14
287
295
wd:: WC_E_Modifier => continue ,
288
296
_ => {
@@ -392,7 +400,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
392
400
wd:: WC_Numeric => Numeric , // rule WB8, WB9, WB11, WB13b
393
401
wd:: WC_Katakana => Katakana , // rule WB13, WB13b
394
402
wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
395
- wd:: WC_Regional_Indicator => Regional ( true ) , // rule WB13c
403
+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
396
404
wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
397
405
// rule WB4:
398
406
wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
@@ -474,8 +482,31 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
474
482
break ;
475
483
}
476
484
} ,
477
- Regional ( _) => match cat {
478
- wd:: WC_Regional_Indicator => Regional ( true ) , // rule WB13c
485
+ Regional ( mut regional_state) => match cat {
486
+ // rule WB13c
487
+ wd:: WC_Regional_Indicator => {
488
+ if regional_state == RegionalState :: Unknown {
489
+ let count = self . string [ ..previdx]
490
+ . chars ( ) . rev ( )
491
+ . map ( |c| wd:: word_category ( c) )
492
+ // Ignore because of WB4
493
+ // Combining characters *inside* flag emoji. Yay.
494
+ . filter ( |& c| ! ( c == wd:: WC_ZWJ || c == wd:: WC_Extend || c == wd:: WC_Format ) )
495
+ . take_while ( |& c| c == wd:: WC_Regional_Indicator )
496
+ . count ( ) ;
497
+ regional_state = if count % 2 == 0 {
498
+ RegionalState :: Full
499
+ } else {
500
+ RegionalState :: Half
501
+ } ;
502
+ }
503
+ if regional_state == RegionalState :: Full {
504
+ take_curr = false ;
505
+ break ;
506
+ } else {
507
+ Regional ( RegionalState :: Full )
508
+ }
509
+ }
479
510
_ => {
480
511
take_curr = false ;
481
512
break ;
0 commit comments