Skip to content

Commit 13ddba0

Browse files
committed
tests passing
1 parent b378be6 commit 13ddba0

File tree

4 files changed

+2338
-2
lines changed

4 files changed

+2338
-2
lines changed

src/grapheme.rs

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use core::prelude::*;
12+
13+
use core::cmp;
14+
15+
use tables::grapheme::GraphemeCat;
16+
17+
/// External iterator for grapheme clusters and byte offsets.
18+
#[derive(Clone)]
19+
pub struct GraphemeIndices<'a> {
20+
start_offset: usize,
21+
iter: Graphemes<'a>,
22+
}
23+
24+
impl<'a> Iterator for GraphemeIndices<'a> {
25+
type Item = (usize, &'a str);
26+
27+
#[inline]
28+
fn next(&mut self) -> Option<(usize, &'a str)> {
29+
self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
30+
}
31+
32+
#[inline]
33+
fn size_hint(&self) -> (usize, Option<usize>) {
34+
self.iter.size_hint()
35+
}
36+
}
37+
38+
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
39+
#[inline]
40+
fn next_back(&mut self) -> Option<(usize, &'a str)> {
41+
self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
42+
}
43+
}
44+
45+
/// External iterator for a string's
46+
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
47+
#[derive(Clone)]
48+
pub struct Graphemes<'a> {
49+
string: &'a str,
50+
extended: bool,
51+
cat: Option<GraphemeCat>,
52+
catb: Option<GraphemeCat>,
53+
}
54+
55+
// States of the scanner that applies the cluster boundary rules.
//
// The forward and backward scanners share these states. For the Hangul
// states the forward scanner tracks what has been seen so far, while the
// backward scanner tracks which kind of letter sits immediately to the right.
#[derive(PartialEq, Eq)]
enum GraphemeState {
    // No character of the current cluster has been classified yet.
    Start,
    // Only extending characters may still join the cluster.
    FindExtend,
    // Forward: inside a run of L. Backward: the letter to the right is L.
    HangulL,
    // Forward: an LV or V was seen. Backward: the letter to the right is V.
    HangulLV,
    // Forward: an LVT or T was seen. Backward: the letter to the right is T.
    HangulLVT,
    // Inside a run of regional indicator characters.
    Regional,
}
65+
66+
impl<'a> Iterator for Graphemes<'a> {
67+
type Item = &'a str;
68+
69+
#[inline]
70+
fn size_hint(&self) -> (usize, Option<usize>) {
71+
let slen = self.string.len();
72+
(cmp::min(slen, 1), Some(slen))
73+
}
74+
75+
#[inline]
76+
fn next(&mut self) -> Option<&'a str> {
77+
use self::GraphemeState::*;
78+
use tables::grapheme as gr;
79+
if self.string.len() == 0 {
80+
return None;
81+
}
82+
83+
let mut take_curr = true;
84+
let mut idx = 0;
85+
let mut state = Start;
86+
let mut cat = gr::GC_Any;
87+
for (curr, ch) in self.string.char_indices() {
88+
idx = curr;
89+
90+
// retrieve cached category, if any
91+
// We do this because most of the time we would end up
92+
// looking up each character twice.
93+
cat = match self.cat {
94+
None => gr::grapheme_category(ch),
95+
_ => self.cat.take().unwrap()
96+
};
97+
98+
if match cat {
99+
gr::GC_Extend => true,
100+
gr::GC_SpacingMark if self.extended => true,
101+
_ => false
102+
} {
103+
state = FindExtend; // rule GB9/GB9a
104+
continue;
105+
}
106+
107+
state = match state {
108+
Start if '\r' == ch => {
109+
let slen = self.string.len();
110+
let nidx = idx + 1;
111+
if nidx != slen && self.string.char_at(nidx) == '\n' {
112+
idx = nidx; // rule GB3
113+
}
114+
break; // rule GB4
115+
}
116+
Start => match cat {
117+
gr::GC_Control => break,
118+
gr::GC_L => HangulL,
119+
gr::GC_LV | gr::GC_V => HangulLV,
120+
gr::GC_LVT | gr::GC_T => HangulLVT,
121+
gr::GC_Regional_Indicator => Regional,
122+
_ => FindExtend
123+
},
124+
FindExtend => { // found non-extending when looking for extending
125+
take_curr = false;
126+
break;
127+
},
128+
HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
129+
gr::GC_L => continue,
130+
gr::GC_LV | gr::GC_V => HangulLV,
131+
gr::GC_LVT => HangulLVT,
132+
_ => {
133+
take_curr = false;
134+
break;
135+
}
136+
},
137+
HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
138+
gr::GC_V => continue,
139+
gr::GC_T => HangulLVT,
140+
_ => {
141+
take_curr = false;
142+
break;
143+
}
144+
},
145+
HangulLVT => match cat { // rule GB8: (LVT|T) x T
146+
gr::GC_T => continue,
147+
_ => {
148+
take_curr = false;
149+
break;
150+
}
151+
},
152+
Regional => match cat { // rule GB8a
153+
gr::GC_Regional_Indicator => continue,
154+
_ => {
155+
take_curr = false;
156+
break;
157+
}
158+
}
159+
}
160+
}
161+
162+
self.cat = if take_curr {
163+
idx = idx + self.string.char_at(idx).len_utf8();
164+
None
165+
} else {
166+
Some(cat)
167+
};
168+
169+
let retstr = &self.string[..idx];
170+
self.string = &self.string[idx..];
171+
Some(retstr)
172+
}
173+
}
174+
175+
impl<'a> DoubleEndedIterator for Graphemes<'a> {
176+
#[inline]
177+
fn next_back(&mut self) -> Option<&'a str> {
178+
use self::GraphemeState::*;
179+
use tables::grapheme as gr;
180+
if self.string.len() == 0 {
181+
return None;
182+
}
183+
184+
let mut take_curr = true;
185+
let mut idx = self.string.len();
186+
let mut previdx = idx;
187+
let mut state = Start;
188+
let mut cat = gr::GC_Any;
189+
for (curr, ch) in self.string.char_indices().rev() {
190+
previdx = idx;
191+
idx = curr;
192+
193+
// cached category, if any
194+
cat = match self.catb {
195+
None => gr::grapheme_category(ch),
196+
_ => self.catb.take().unwrap()
197+
};
198+
199+
// a matching state machine that runs *backwards* across an input string
200+
// note that this has some implications for the Hangul matching, since
201+
// we now need to know what the rightward letter is:
202+
//
203+
// Right to left, we have:
204+
// L x L
205+
// V x (L|V|LV)
206+
// T x (V|T|LV|LVT)
207+
// HangulL means the letter to the right is L
208+
// HangulLV means the letter to the right is V
209+
// HangulLVT means the letter to the right is T
210+
state = match state {
211+
Start if '\n' == ch => {
212+
if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
213+
idx -= 1; // rule GB3
214+
}
215+
break; // rule GB4
216+
},
217+
Start | FindExtend => match cat {
218+
gr::GC_Extend => FindExtend,
219+
gr::GC_SpacingMark if self.extended => FindExtend,
220+
gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
221+
gr::GC_V => HangulLV,
222+
gr::GC_T => HangulLVT,
223+
gr::GC_Regional_Indicator => Regional,
224+
gr::GC_Control => {
225+
take_curr = Start == state;
226+
break;
227+
},
228+
_ => break
229+
},
230+
HangulL => match cat { // char to right is an L
231+
gr::GC_L => continue, // L x L is the only legal match
232+
_ => {
233+
take_curr = false;
234+
break;
235+
}
236+
},
237+
HangulLV => match cat { // char to right is a V
238+
gr::GC_V => continue, // V x V, right char is still V
239+
gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
240+
_ => {
241+
take_curr = false;
242+
break;
243+
}
244+
},
245+
HangulLVT => match cat { // char to right is a T
246+
gr::GC_T => continue, // T x T, right char is still T
247+
gr::GC_V => HangulLV, // V x T, right char is now V
248+
gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
249+
_ => {
250+
take_curr = false;
251+
break;
252+
}
253+
},
254+
Regional => match cat { // rule GB8a
255+
gr::GC_Regional_Indicator => continue,
256+
_ => {
257+
take_curr = false;
258+
break;
259+
}
260+
}
261+
}
262+
}
263+
264+
self.catb = if take_curr {
265+
None
266+
} else {
267+
idx = previdx;
268+
Some(cat)
269+
};
270+
271+
let retstr = &self.string[idx..];
272+
self.string = &self.string[..idx];
273+
Some(retstr)
274+
}
275+
}
276+
277+
#[inline]
278+
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
279+
Graphemes { string: s, extended: is_extended, cat: None, catb: None }
280+
}
281+
282+
#[inline]
283+
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
284+
GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
285+
}

0 commit comments

Comments
 (0)