Skip to content

Commit 93b0d56

Browse files
committed
Added forward iterator for unicode sentences
Passes all tests in the examples provided here: http://www.unicode.org/Public/9.0.0/ucd/auxiliary/SentenceBreakTest.txt
1 parent fa10dd3 commit 93b0d56

File tree

4 files changed

+376
-2
lines changed

4 files changed

+376
-2
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
1212
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
1313
readme = "README.md"
1414
description = """
15-
This crate provides Grapheme Cluster and Word boundaries
15+
This crate provides Grapheme Cluster, Word and Sentence boundaries
1616
according to Unicode Standard Annex #29 rules.
1717
"""
1818

src/lib.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
11+
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
1212
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
1313
//!
1414
//! ```rust
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
6969
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
70+
pub use sentence::{USentenceBounds};
7071

7172
mod grapheme;
7273
mod tables;
7374
mod word;
75+
mod sentence;
7476

7577
#[cfg(test)]
7678
mod test;
@@ -174,6 +176,12 @@ pub trait UnicodeSegmentation {
174176
/// assert_eq!(&swi1[..], b);
175177
/// ```
176178
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
179+
180+
/// Returns an iterator over substrings of `self` separated on
181+
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182+
///
183+
/// The concatenation of the substrings returned by this function is just the original string.
184+
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
177185
}
178186

179187
impl UnicodeSegmentation for str {
@@ -201,4 +209,9 @@ impl UnicodeSegmentation for str {
201209
fn split_word_bound_indices(&self) -> UWordBoundIndices {
202210
word::new_word_bound_indices(self)
203211
}
212+
213+
#[inline]
214+
fn split_sentence_bounds(&self) -> USentenceBounds {
215+
sentence::new_sentence_bounds(self)
216+
}
204217
}

src/sentence.rs

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use core::cmp;
12+
13+
// All of the logic for forward iteration over sentences
14+
mod fwd {
15+
use tables::sentence::SentenceCat;
16+
use core::cmp;
17+
18+
/// One slot of the look-behind window used by the forward sentence scanner.
///
/// Each variant is a condensed form of a `SentenceCat`, keeping only the
/// distinctions that the boundary rules need when looking backwards.
#[derive(Copy, Clone, Eq, PartialEq)]
enum StatePart {
    /// Start of text; fills the window before any character is consumed.
    Sot,
    /// End of text; pushed once the final break has been reported.
    Eot,
    /// Any category that has no dedicated slot below.
    Other,
    /// Carriage return.
    CR,
    /// Line feed.
    LF,
    /// Separator (`SC_Sep`).
    Sep,
    /// Ambiguous terminator (`SC_ATerm`).
    ATerm,
    /// Either an `SC_Upper` or an `SC_Lower` character.
    UpperLower,
    /// A run of one or more consecutive `SC_Close` characters.
    ClosePlus,
    /// A run of one or more consecutive `SC_Sp` characters.
    SpPlus,
    /// Sentence terminator (`SC_STerm`).
    STerm,
}
32+
33+
#[derive(Clone, PartialEq, Eq)]
34+
struct SentenceBreaksState(pub [StatePart; 4]);
35+
36+
/// Window contents before any character has been read: every slot is
/// start-of-text.
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([StatePart::Sot; 4]);
42+
43+
pub struct SentenceBreaks<'a> {
44+
pub string: &'a str,
45+
pos: usize,
46+
state: SentenceBreaksState
47+
}
48+
49+
impl SentenceBreaksState {
50+
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
51+
let &SentenceBreaksState(parts) = self;
52+
let parts = match (parts[3], cat) {
53+
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
54+
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
55+
_ => [
56+
parts[1],
57+
parts[2],
58+
parts[3],
59+
match cat {
60+
SentenceCat::SC_CR => StatePart::CR,
61+
SentenceCat::SC_LF => StatePart::LF,
62+
SentenceCat::SC_Sep => StatePart::Sep,
63+
SentenceCat::SC_ATerm => StatePart::ATerm,
64+
SentenceCat::SC_Upper |
65+
SentenceCat::SC_Lower => StatePart::UpperLower,
66+
SentenceCat::SC_Close => StatePart::ClosePlus,
67+
SentenceCat::SC_Sp => StatePart::SpPlus,
68+
SentenceCat::SC_STerm => StatePart::STerm,
69+
_ => StatePart::Other
70+
}
71+
]
72+
};
73+
SentenceBreaksState(parts)
74+
}
75+
76+
fn end(&self) -> SentenceBreaksState {
77+
let &SentenceBreaksState(parts) = self;
78+
SentenceBreaksState([
79+
parts[1],
80+
parts[2],
81+
parts[3],
82+
StatePart::Eot
83+
])
84+
}
85+
86+
fn match1(&self, part: StatePart) -> bool {
87+
let &SentenceBreaksState(parts) = self;
88+
part == parts[3]
89+
}
90+
91+
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
92+
let &SentenceBreaksState(parts) = self;
93+
part1 == parts[2] && part2 == parts[3]
94+
}
95+
}
96+
97+
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
98+
let aterm_part = {
99+
// ATerm Close* Sp*
100+
let &SentenceBreaksState(parts) = state;
101+
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
102+
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
103+
parts[idx]
104+
};
105+
106+
if aterm_part == StatePart::ATerm {
107+
use tables::sentence as se;
108+
109+
for next_char in ahead.chars() {
110+
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
111+
match se::sentence_category(next_char) {
112+
se::SC_Lower => return true,
113+
se::SC_OLetter |
114+
se::SC_Upper |
115+
se::SC_Sep | se::SC_CR | se::SC_LF |
116+
se::SC_STerm | se::SC_ATerm => return false,
117+
_ => continue
118+
}
119+
}
120+
}
121+
122+
false
123+
}
124+
125+
/// True when the window ends in `SATerm Close* Sp*` (the left-hand context
/// shared by rules SB8a and SB10).
fn match_sb8a(state: &SentenceBreaksState) -> bool {
    let window = state.0;

    // Walk back over an optional Sp run, then an optional Close run.
    let mut anchor = 3;
    if window[anchor] == StatePart::SpPlus {
        anchor -= 1;
    }
    if window[anchor] == StatePart::ClosePlus {
        anchor -= 1;
    }

    match window[anchor] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false,
    }
}
132+
133+
/// True when the window ends in `SATerm Close*` (the left-hand context of
/// rule SB9).
fn match_sb9(state: &SentenceBreaksState) -> bool {
    let window = state.0;

    // Skip an optional Close run sitting in the most recent slot.
    let anchor = match window[3] {
        StatePart::ClosePlus => 2,
        _ => 3,
    };

    match window[anchor] {
        StatePart::STerm | StatePart::ATerm => true,
        _ => false,
    }
}
139+
140+
fn match_sb11(state: &SentenceBreaksState) -> bool {
141+
// SATerm Close* Sp* ParaSep?
142+
let &SentenceBreaksState(parts) = state;
143+
let mut idx = match parts[3] {
144+
StatePart::Sep |
145+
StatePart::CR |
146+
StatePart::LF => 2,
147+
_ => 3
148+
};
149+
150+
if parts[idx] == StatePart::SpPlus { idx -= 1 }
151+
if parts[idx] == StatePart::ClosePlus { idx -= 1}
152+
153+
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
154+
}
155+
156+
impl<'a> Iterator for SentenceBreaks<'a> {
157+
// Returns the index of the character which follows a break
158+
type Item = usize;
159+
160+
#[inline]
161+
fn size_hint(&self) -> (usize, Option<usize>) {
162+
let slen = self.string.len();
163+
// A sentence could be one character
164+
(cmp::min(slen, 2), Some(slen + 1))
165+
}
166+
167+
#[inline]
168+
fn next(&mut self) -> Option<usize> {
169+
use tables::sentence as se;
170+
171+
for next_char in self.string[self.pos..].chars() {
172+
let position_before = self.pos;
173+
let state_before = self.state.clone();
174+
175+
let next_cat = se::sentence_category(next_char);
176+
177+
self.pos += next_char.len_utf8();
178+
self.state = self.state.next(next_cat);
179+
180+
match next_cat {
181+
// SB1
182+
_ if state_before.match1(StatePart::Sot) =>
183+
return Some(position_before),
184+
185+
// SB3
186+
SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
187+
continue,
188+
189+
// SB4
190+
_ if state_before.match1(StatePart::Sep)
191+
|| state_before.match1(StatePart::CR)
192+
|| state_before.match1(StatePart::LF)
193+
=> return Some(position_before),
194+
195+
// SB5
196+
SentenceCat::SC_Extend |
197+
SentenceCat::SC_Format => self.state = state_before,
198+
199+
// SB6
200+
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
201+
continue,
202+
203+
// SB7
204+
SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
205+
continue,
206+
207+
// SB8
208+
_ if match_sb8(&state_before, &self.string[position_before..]) =>
209+
continue,
210+
211+
// SB8a
212+
SentenceCat::SC_SContinue |
213+
SentenceCat::SC_STerm |
214+
SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
215+
continue,
216+
217+
// SB9
218+
SentenceCat::SC_Close |
219+
SentenceCat::SC_Sp |
220+
SentenceCat::SC_Sep |
221+
SentenceCat::SC_CR |
222+
SentenceCat::SC_LF if match_sb9(&state_before) =>
223+
continue,
224+
225+
// SB10
226+
SentenceCat::SC_Sp |
227+
SentenceCat::SC_Sep |
228+
SentenceCat::SC_CR |
229+
SentenceCat::SC_LF if match_sb8a(&state_before) =>
230+
continue,
231+
232+
// SB11
233+
_ if match_sb11(&state_before) =>
234+
return Some(position_before),
235+
236+
// SB998
237+
_ => continue
238+
}
239+
}
240+
241+
// SB2
242+
if self.state.match1(StatePart::Sot) {
243+
None
244+
} else if self.state.match1(StatePart::Eot) {
245+
None
246+
} else {
247+
self.state = self.state.end();
248+
Some(self.pos)
249+
}
250+
}
251+
}
252+
253+
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
254+
SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
255+
}
256+
257+
}
258+
259+
/// External iterator for a string's
260+
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
261+
pub struct USentenceBounds<'a> {
262+
iter: fwd::SentenceBreaks<'a>,
263+
sentence_start: Option<usize>
264+
}
265+
266+
#[inline]
267+
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
268+
USentenceBounds {
269+
iter: fwd::new_sentence_breaks(source),
270+
sentence_start: None
271+
}
272+
}
273+
274+
impl<'a> Iterator for USentenceBounds<'a> {
275+
type Item = &'a str;
276+
277+
#[inline]
278+
fn size_hint(&self) -> (usize, Option<usize>) {
279+
let (lower, upper) = self.iter.size_hint();
280+
(cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
281+
}
282+
283+
#[inline]
284+
fn next(&mut self) -> Option<&'a str> {
285+
if self.sentence_start == None {
286+
if let Some(start_pos) = self.iter.next() {
287+
self.sentence_start = Some(start_pos)
288+
} else {
289+
return None
290+
}
291+
}
292+
293+
if let Some(break_pos) = self.iter.next() {
294+
let start_pos = self.sentence_start.unwrap();
295+
let sentence = &self.iter.string[start_pos..break_pos];
296+
self.sentence_start = Some(break_pos);
297+
Some(sentence)
298+
} else {
299+
None
300+
}
301+
}
302+
}

0 commit comments

Comments
 (0)