Skip to content

Commit 83adfde

Browse files
committed
Merge branch 'mbrubeck-unicode-9-wip'
2 parents 10bd2ed + d402895 commit 83adfde

File tree

9 files changed

+1882
-886
lines changed

9 files changed

+1882
-886
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "unicode-segmentation"
4-
version = "0.1.3"
4+
version = "1.0.0"
55
authors = ["kwantam <kwantam@gmail.com>"]
66

77
homepage = "https://github.com/unicode-rs/unicode-segmentation"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,5 @@ to your `Cargo.toml`:
4040

4141
```toml
4242
[dependencies]
43-
unicode-segmentation = "0.1.3"
43+
unicode-segmentation = "1.0.0"
4444
```

scripts/unicode.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@ def emit_property_module(f, mod, tbl, emit):
257257
def emit_break_module(f, break_table, break_cats, name):
258258
Name = name.capitalize()
259259
f.write("""pub mod %s {
260-
use core::slice::SliceExt;
261260
use core::result::Result::{Ok, Err};
262261
263262
pub use self::%sCat::*;

src/grapheme.rs

Lines changed: 132 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,21 @@ pub struct Graphemes<'a> {
4848
extended: bool,
4949
cat: Option<GraphemeCat>,
5050
catb: Option<GraphemeCat>,
51+
regional_count_back: Option<usize>,
5152
}
5253

5354
// state machine for cluster boundary rules
54-
#[derive(PartialEq,Eq)]
55+
#[derive(Copy,Clone,PartialEq,Eq)]
5556
enum GraphemeState {
5657
Start,
5758
FindExtend,
5859
HangulL,
5960
HangulLV,
6061
HangulLVT,
62+
Prepend,
6163
Regional,
64+
Emoji,
65+
Zwj,
6266
}
6367

6468
impl<'a> Iterator for Graphemes<'a> {
@@ -82,6 +86,11 @@ impl<'a> Iterator for Graphemes<'a> {
8286
let mut idx = 0;
8387
let mut state = Start;
8488
let mut cat = gr::GC_Any;
89+
90+
// caching used by next_back() should be invalidated
91+
self.regional_count_back = None;
92+
self.catb = None;
93+
8594
for (curr, ch) in self.string.char_indices() {
8695
idx = curr;
8796

@@ -93,13 +102,18 @@ impl<'a> Iterator for Graphemes<'a> {
93102
_ => self.cat.take().unwrap()
94103
};
95104

96-
if match cat {
97-
gr::GC_Extend => true,
98-
gr::GC_SpacingMark if self.extended => true,
99-
_ => false
105+
if (state, cat) == (Emoji, gr::GC_Extend) {
106+
continue; // rule GB10
107+
}
108+
109+
if let Some(new_state) = match cat {
110+
gr::GC_Extend => Some(FindExtend), // rule GB9
111+
gr::GC_SpacingMark if self.extended => Some(FindExtend), // rule GB9a
112+
gr::GC_ZWJ => Some(Zwj), // rule GB9/GB11
113+
_ => None
100114
} {
101-
state = FindExtend; // rule GB9/GB9a
102-
continue;
115+
state = new_state;
116+
continue;
103117
}
104118

105119
state = match state {
@@ -116,7 +130,9 @@ impl<'a> Iterator for Graphemes<'a> {
116130
gr::GC_L => HangulL,
117131
gr::GC_LV | gr::GC_V => HangulLV,
118132
gr::GC_LVT | gr::GC_T => HangulLVT,
133+
gr::GC_Prepend if self.extended => Prepend,
119134
gr::GC_Regional_Indicator => Regional,
135+
gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji,
120136
_ => FindExtend
121137
},
122138
FindExtend => { // found non-extending when looking for extending
@@ -147,13 +163,35 @@ impl<'a> Iterator for Graphemes<'a> {
147163
break;
148164
}
149165
},
150-
Regional => match cat { // rule GB8a
151-
gr::GC_Regional_Indicator => continue,
166+
Prepend => match cat { // rule GB9b
167+
gr::GC_Control => {
168+
take_curr = false;
169+
break;
170+
}
171+
_ => continue
172+
},
173+
Regional => match cat { // rule GB12/GB13
174+
gr::GC_Regional_Indicator => FindExtend,
152175
_ => {
153176
take_curr = false;
154177
break;
155178
}
156-
}
179+
},
180+
Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
181+
gr::GC_E_Modifier => continue,
182+
_ => {
183+
take_curr = false;
184+
break;
185+
}
186+
},
187+
Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
188+
gr::GC_Glue_After_Zwj => continue,
189+
gr::GC_E_Base_GAZ => Emoji,
190+
_ => {
191+
take_curr = false;
192+
break;
193+
}
194+
},
157195
}
158196
}
159197

@@ -184,7 +222,11 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
184222
let mut previdx = idx;
185223
let mut state = Start;
186224
let mut cat = gr::GC_Any;
187-
for (curr, ch) in self.string.char_indices().rev() {
225+
226+
// caching used by next() should be invalidated
227+
self.cat = None;
228+
229+
'outer: for (curr, ch) in self.string.char_indices().rev() {
188230
previdx = idx;
189231
idx = curr;
190232

@@ -215,6 +257,9 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
215257
Start | FindExtend => match cat {
216258
gr::GC_Extend => FindExtend,
217259
gr::GC_SpacingMark if self.extended => FindExtend,
260+
gr::GC_ZWJ => FindExtend,
261+
gr::GC_E_Modifier => Emoji,
262+
gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => Zwj,
218263
gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
219264
gr::GC_V => HangulLV,
220265
gr::GC_T => HangulLVT,
@@ -249,8 +294,62 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
249294
break;
250295
}
251296
},
252-
Regional => match cat { // rule GB8a
253-
gr::GC_Regional_Indicator => continue,
297+
Prepend => {
298+
// not used in reverse iteration
299+
unreachable!()
300+
},
301+
Regional => { // rule GB12/GB13
302+
// Need to scan backward to find if this is preceded by an odd or even number
303+
// of Regional_Indicator characters.
304+
let count = match self.regional_count_back {
305+
Some(count) => count,
306+
None => self.string[..previdx].chars().rev().take_while(|c| {
307+
gr::grapheme_category(*c) == gr::GC_Regional_Indicator
308+
}).count()
309+
};
310+
// Cache the count to avoid re-scanning the same chars on the next iteration.
311+
self.regional_count_back = count.checked_sub(1);
312+
313+
if count % 2 == 0 {
314+
take_curr = false;
315+
break;
316+
}
317+
continue;
318+
},
319+
Emoji => { // char to right is E_Modifier
320+
// In order to decide whether to break before this E_Modifier char, we need to
321+
// scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
322+
let mut ebg_idx = None;
323+
for (startidx, prev) in self.string[..previdx].char_indices().rev() {
324+
match (ebg_idx, gr::grapheme_category(prev)) {
325+
(None, gr::GC_Extend) => continue,
326+
(None, gr::GC_E_Base) => { // rule GB10
327+
// Found an Emoji modifier sequence. Return the whole sequence.
328+
idx = startidx;
329+
break 'outer;
330+
}
331+
(None, gr::GC_E_Base_GAZ) => { // rule GB10
332+
// Keep scanning in case this is part of a ZWJ x EBG pair.
333+
ebg_idx = Some(startidx);
334+
}
335+
(Some(_), gr::GC_ZWJ) => { // rule GB11
336+
idx = startidx;
337+
break 'outer;
338+
}
339+
_ => break
340+
}
341+
}
342+
if let Some(ebg_idx) = ebg_idx {
343+
// Found an EBG without a ZWJ before it.
344+
idx = ebg_idx;
345+
break;
346+
}
347+
// Not part of an Emoji modifier sequence. Break here.
348+
take_curr = false;
349+
break;
350+
},
351+
Zwj => match cat { // char to right is (GAZ|EBG)
352+
gr::GC_ZWJ => continue, // rule GB11: ZWJ x (GAZ|EBG)
254353
_ => {
255354
take_curr = false;
256355
break;
@@ -266,6 +365,19 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
266365
Some(cat)
267366
};
268367

368+
if self.extended && cat != gr::GC_Control {
369+
// rule GB9b: include any preceding Prepend characters
370+
for (i, c) in self.string[..idx].char_indices().rev() {
371+
match gr::grapheme_category(c) {
372+
gr::GC_Prepend => idx = i,
373+
cat => {
374+
self.catb = Some(cat);
375+
break;
376+
}
377+
}
378+
}
379+
}
380+
269381
let retstr = &self.string[idx..];
270382
self.string = &self.string[..idx];
271383
Some(retstr)
@@ -274,7 +386,13 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
274386

275387
#[inline]
276388
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
277-
Graphemes { string: s, extended: is_extended, cat: None, catb: None }
389+
Graphemes {
390+
string: s,
391+
extended: is_extended,
392+
cat: None,
393+
catb: None,
394+
regional_count_back: None
395+
}
278396
}
279397

280398
#[inline]

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
//!
4747
//! ```toml
4848
//! [dependencies]
49-
//! unicode-segmentation = "0.1.3"
49+
//! unicode-segmentation = "1.0.0"
5050
//! ```
5151
5252
#![deny(missing_docs, unsafe_code)]
@@ -96,7 +96,7 @@ pub trait UnicodeSegmentation {
9696
/// assert_eq!(&gr1[..], b);
9797
///
9898
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
99-
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"];
99+
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
100100
///
101101
/// assert_eq!(&gr2[..], b);
102102
/// ```

0 commit comments

Comments
 (0)