Skip to content

Commit 605dc7d

Browse files
Manishearth authored and mbrubeck committed
Reverse word iteration -- get regional indicators working
1 parent f3ea31d commit 605dc7d

File tree

2 files changed

+57
-21
lines changed

2 files changed

+57
-21
lines changed

src/test.rs

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -93,17 +93,22 @@ fn test_words() {
9393

9494
for &(s, w) in TEST_WORD {
9595
macro_rules! assert_ {
96-
($x:expr) => (assert!($x, "Word test {} for testcase ({:?}, {:?}) failed", stringify!($x), s, w))
96+
($test:expr, $exp:expr, $name:expr) => {
97+
// collect into vector for better diagnostics in failure case
98+
let testing = $test.collect::<Vec<_>>();
99+
let expected = $exp.collect::<Vec<_>>();
100+
assert_eq!(testing, expected, "{} test for testcase ({:?}, {:?}) failed.", $name, s, w)
101+
}
97102
}
98103
// test forward iterator
99-
assert_!(s.split_word_bounds()
100-
.zip(w.iter().cloned())
101-
.all(|(a,b)| a == b));
104+
assert_!(s.split_word_bounds(),
105+
w.iter().cloned(),
106+
"Forward word boundaries");
102107

103108
// test reverse iterator
104-
assert_!(s.split_word_bounds().rev()
105-
.zip(w.iter().rev().cloned())
106-
.all(|(a,b)| a == b));
109+
assert_!(s.split_word_bounds().rev(),
110+
w.iter().rev().cloned(),
111+
"Reverse word boundaries");
107112

108113
// generate offsets from word string lengths
109114
let mut indices = vec![0];
@@ -114,13 +119,13 @@ fn test_words() {
114119
let indices = indices;
115120

116121
// test forward indices iterator
117-
assert_!(s.split_word_bound_indices()
118-
.zip(indices.iter())
119-
.all(|((l,_),m)| l == *m));
122+
assert_!(s.split_word_bound_indices().map(|(l,_)| l),
123+
indices.iter().cloned(),
124+
"Forward word indices");
120125

121126
// test backward indices iterator
122-
assert_!(s.split_word_bound_indices().rev()
123-
.zip(indices.iter().rev())
124-
.all(|((l,_),m)| l == *m));
127+
assert_!(s.split_word_bound_indices().rev().map(|(l,_)| l),
128+
indices.iter().rev().cloned(),
129+
"Reverse word indices");
125130
}
126131
}

src/word.rs

Lines changed: 39 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,7 @@ enum UWordBoundsState {
8080
Numeric,
8181
Katakana,
8282
ExtendNumLet,
83-
Regional(/* half */ bool),
83+
Regional(RegionalState),
8484
FormatExtend(FormatExtendType),
8585
Zwj(/* tainted */ bool),
8686
Emoji,
@@ -97,6 +97,13 @@ enum FormatExtendType {
9797
RequireNumeric,
9898
}
9999

100+
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
101+
enum RegionalState {
102+
Half,
103+
Full,
104+
Unknown,
105+
}
106+
100107
impl<'a> Iterator for UWordBounds<'a> {
101108
type Item = &'a str;
102109

@@ -184,7 +191,7 @@ impl<'a> Iterator for UWordBounds<'a> {
184191
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
185192
wd::WC_Katakana => Katakana, // rule WB13, WB13a
186193
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
187-
wd::WC_Regional_Indicator => Regional(/* half = */ true), // rule WB13c
194+
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
188195
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
189196
wd::WC_ZWJ => Zwj(false), // rule WB3c
190197
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
@@ -269,20 +276,21 @@ impl<'a> Iterator for UWordBounds<'a> {
269276
break;
270277
}
271278
},
272-
Regional(false) => {
279+
Regional(RegionalState::Full) => {
273280
// if it reaches here we've gone too far,
274281
// a full flag can only compose with ZWJ/Extend/Format
275282
// proceeding it.
276283
take_curr = false;
277284
break;
278285
}
279-
Regional(/* half */ true) => match cat {
280-
wd::WC_Regional_Indicator => Regional(false), // rule WB13c
286+
Regional(RegionalState::Half) => match cat {
287+
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
281288
_ => {
282289
take_curr = false;
283290
break;
284291
}
285292
},
293+
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
286294
Emoji => match cat { // rule WB14
287295
wd::WC_E_Modifier => continue,
288296
_ => {
@@ -392,7 +400,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
392400
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
393401
wd::WC_Katakana => Katakana, // rule WB13, WB13b
394402
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
395-
wd::WC_Regional_Indicator => Regional(true), // rule WB13c
403+
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
396404
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj(false), // rule WB3c
397405
// rule WB4:
398406
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
@@ -474,8 +482,31 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
474482
break;
475483
}
476484
},
477-
Regional(_) => match cat {
478-
wd::WC_Regional_Indicator => Regional(true), // rule WB13c
485+
Regional(mut regional_state) => match cat {
486+
// rule WB13c
487+
wd::WC_Regional_Indicator => {
488+
if regional_state == RegionalState::Unknown {
489+
let count = self.string[..previdx]
490+
.chars().rev()
491+
.map(|c| wd::word_category(c))
492+
// Ignore because of WB4
493+
// Combining characters *inside* flag emoji. Yay.
494+
.filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
495+
.take_while(|&c| c == wd::WC_Regional_Indicator)
496+
.count();
497+
regional_state = if count % 2 == 0 {
498+
RegionalState::Full
499+
} else {
500+
RegionalState::Half
501+
};
502+
}
503+
if regional_state == RegionalState::Full {
504+
take_curr = false;
505+
break;
506+
} else {
507+
Regional(RegionalState::Full)
508+
}
509+
}
479510
_ => {
480511
take_curr = false;
481512
break;

0 commit comments

Comments
 (0)