Skip to content

Commit e3754bc

Browse files
committed
Implement new GB12/GB13 rules for RI sequences
1 parent 5568777 commit e3754bc

File tree

2 files changed

+14
-6
lines changed

2 files changed

+14
-6
lines changed

src/grapheme.rs

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -155,8 +155,8 @@ impl<'a> Iterator for Graphemes<'a> {
155155
break;
156156
}
157157
},
158-
Regional => match cat { // rule GB8a
159-
gr::GC_Regional_Indicator => continue,
158+
Regional => match cat { // rule GB12/GB13
159+
gr::GC_Regional_Indicator => FindExtend,
160160
_ => {
161161
take_curr = false;
162162
break;
@@ -276,12 +276,20 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
276276
break;
277277
}
278278
},
279-
Regional => match cat { // rule GB8a
280-
gr::GC_Regional_Indicator => continue,
281-
_ => {
279+
Regional => { // rule GB12/GB13
280+
// Need to scan backward to find if this is preceded by an odd or even number
281+
// of Regional_Indicator characters.
282+
//
283+
// TODO: Save this state to avoid O(n^2) re-scanning in long RI sequences?
284+
let prev_chars = self.string[..previdx].chars().rev();
285+
let count = prev_chars.take_while(|c| {
286+
gr::grapheme_category(*c) == gr::GC_Regional_Indicator
287+
}).count();
288+
if count % 2 == 0 {
282289
take_curr = false;
283290
break;
284291
}
292+
continue;
285293
},
286294
Emoji => { // char to right is E_Modifier
287295
// In order to decide whether to break before this E_Modifier char, we need to

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ pub trait UnicodeSegmentation {
9696
/// assert_eq!(&gr1[..], b);
9797
///
9898
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
99-
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"];
99+
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
100100
///
101101
/// assert_eq!(&gr2[..], b);
102102
/// ```

0 commit comments

Comments
 (0)