Skip to content

Commit 8b0f1e0

Browse files
committed
make no_std a feature; clear out feature deps; bump version number
1 parent 4b4769a commit 8b0f1e0

File tree

11 files changed

+2105
-1749
lines changed

11 files changed

+2105
-1749
lines changed

.travis.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
language: rust
22
sudo: false
33
script:
4-
- cargo build --verbose
5-
- cargo test --verbose
4+
- cargo build --verbose --features no_std
5+
- cargo test --verbose --features no_std
6+
- cargo clean
7+
- cargo build --verbose --features default
8+
- cargo test --verbose --features default
69
- rustdoc --test README.md -L target/debug -L target/debug/deps
710
- cargo doc
811
after_success: |

Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "unicode-segmentation"
4-
version = "0.0.1"
4+
version = "0.1.0"
55
authors = ["kwantam <kwantam@gmail.com>"]
66

77
homepage = "https://github.com/unicode-rs/unicode-segmentation"
@@ -17,3 +17,7 @@ according to Unicode Standard Annex #29 rules.
1717
"""
1818

1919
exclude = [ "target/*", "Cargo.lock", "scripts/tmp" ]
20+
21+
[features]
22+
default = []
23+
no_std = []

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,17 @@ fn main() {
2626
}
2727
```
2828

29+
# features
30+
31+
unicode-segmentation supports a `no_std` feature. Enabling it removes the dependency on `std`;
32+
the crate then uses the equivalent functions from `core` instead.
33+
2934
# crates.io
3035

3136
You can use this package in your project by adding the following
3237
to your `Cargo.toml`:
3338

3439
```toml
3540
[dependencies]
36-
unicode-segmentation = "0.0.1"
41+
unicode-segmentation = "0.1.0"
3742
```

scripts/unicode.py

Lines changed: 120 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
# except according to those terms.
1212

1313
# This script uses the following Unicode tables:
14+
# - DerivedCoreProperties.txt
1415
# - auxiliary/GraphemeBreakProperty.txt
1516
# - auxiliary/WordBreakProperty.txt
1617
# - ReadMe.txt
18+
# - UnicodeData.txt
1719
#
1820
# Since this should not require frequent updates, we just store this
1921
# out-of-line and check the unicode.rs file into git.
@@ -30,7 +32,7 @@
3032
// option. This file may not be copied, modified, or distributed
3133
// except according to those terms.
3234
33-
// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
35+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3436
3537
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3638
'''
@@ -64,6 +66,48 @@ def fetch(f):
6466
sys.stderr.write("cannot load %s" % f)
6567
exit(1)
6668

69+
def load_gencats(f):
    """Load general-category data from a UnicodeData.txt-style file.

    Fetches `f`, reads every line that has exactly 15 `;`-separated
    fields, and returns a dict mapping each category name to the grouped
    form (as produced by group_cats) of the codepoints in that category.
    Every codepoint is also placed in "Assigned" and in any additional
    categories that expanded_categories lists for its general category.
    Surrogate codepoints are skipped.
    """
    fetch(f)

    # codepoint -> the 15 fields of the line that describes it
    records = {}
    # start of an open "<..., First>" range, or -1 when no range is open
    pending_first = -1
    for line in fileinput.input(f):
        fields = line.split(';')
        if len(fields) != 15:
            continue
        cp = int(fields[0], 16)
        if is_surrogate(cp):
            continue
        if pending_first >= 0:
            # This line closes the open range: everything from the "First"
            # codepoint up to (not including) cp shares this line's fields;
            # cp itself is recorded by the fall-through assignment below.
            for filler in xrange(pending_first, cp):
                records[filler] = fields
            pending_first = -1
        if fields[1].endswith(", First>"):
            pending_first = cp
            continue
        records[cp] = fields

    gencats = {}
    for code in records:
        # field 2 is the general category
        gencat = records[code][2]

        # place letter in categories as appropriate
        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
            gencats.setdefault(cat, []).append(code)

    return group_cats(gencats)
104+
105+
def group_cats(cats):
    """Return a new dict with the same keys as `cats`, where each value
    has been passed through group_cat. The input dict is not modified."""
    return dict((name, group_cat(cats[name])) for name in cats)
110+
67111
def group_cat(cat):
68112
cat_out = []
69113
letters = sorted(set(cat))
@@ -133,6 +177,11 @@ def load_properties(f, interestingprops):
133177
if prop not in props:
134178
props[prop] = []
135179
props[prop].append((d_lo, d_hi))
180+
181+
# optimize if possible
182+
for prop in props:
183+
props[prop] = group_cat(ungroup_cat(props[prop]))
184+
136185
return props
137186

138187
def escape_char(c):
@@ -156,13 +205,71 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
156205
format_table_content(f, data, 8)
157206
f.write("\n ];\n\n")
158207

208+
def emit_util_mod(f):
    """Write the Rust `util` module to the file-like object `f`.

    The emitted module contains `bsearch_range_table` (binary search of a
    char over a sorted table of inclusive (lo, hi) ranges) and
    `is_alphanumeric`, built from the private helpers `is_alphabetic` and
    `is_numeric`, which answer ASCII directly and defer to the generated
    `derived_property` / `general_category` modules for chars above '\\x7f'.
    """
    # Raw string on purpose: in a regular string literal Python would turn
    # the \x7f escapes below into a literal DEL control character, which
    # would then be embedded invisibly in the generated Rust source.
    f.write(r"""
pub mod util {
    #[inline]
    pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
        #[cfg(feature = "no_std")]
        use core::cmp::Ordering::{Equal, Less, Greater};
        #[cfg(feature = "no_std")]
        use core::slice::SliceExt;

        #[cfg(not(feature = "no_std"))]
        use std::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo,hi)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }).is_ok()
    }

    #[inline]
    fn is_alphabetic(c: char) -> bool {
        match c {
            'a' ... 'z' | 'A' ... 'Z' => true,
            c if c > '\x7f' => super::derived_property::Alphabetic(c),
            _ => false,
        }
    }

    #[inline]
    fn is_numeric(c: char) -> bool {
        match c {
            '0' ... '9' => true,
            c if c > '\x7f' => super::general_category::N(c),
            _ => false,
        }
    }

    #[inline]
    pub fn is_alphanumeric(c: char) -> bool {
        is_alphabetic(c) || is_numeric(c)
    }
}

""")
252+
253+
def emit_property_module(f, mod, tbl, emit):
    """Write a private Rust module named `mod` to `f`.

    For each name in `emit` (processed in sorted order) the module gets a
    non-public range table `<name>_table`, emitted from `tbl[name]` via
    emit_table, and an inlined `pub fn <name>(c: char) -> bool` that looks
    `c` up in that table with `util::bsearch_range_table`.
    """
    f.write("mod %s {\n" % mod)
    for prop in sorted(emit):
        table_name = "%s_table" % prop
        emit_table(f, table_name, tbl[prop], is_pub=False)
        accessor = (
            "    #[inline]\n",
            "    pub fn %s(c: char) -> bool {\n" % prop,
            "        super::util::bsearch_range_table(c, %s)\n" % table_name,
            "    }\n\n",
        )
        for text in accessor:
            f.write(text)
    f.write("}\n\n")
262+
159263
def emit_break_module(f, break_table, break_cats, name):
160264
Name = name.capitalize()
161265
f.write("""pub mod %s {
266+
#[cfg(feature = "no_std")]
162267
use core::slice::SliceExt;
163-
pub use self::%sCat::*;
268+
#[cfg(feature = "no_std")]
164269
use core::result::Result::{Ok, Err};
165270
271+
pub use self::%sCat::*;
272+
166273
#[allow(non_camel_case_types)]
167274
#[derive(Clone, Copy, PartialEq, Eq)]
168275
pub enum %sCat {
@@ -175,7 +282,10 @@ def emit_break_module(f, break_table, break_cats, name):
175282
f.write(""" }
176283
177284
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
285+
#[cfg(feature = "no_std")]
178286
use core::cmp::Ordering::{Equal, Less, Greater};
287+
#[cfg(not(feature = "no_std"))]
288+
use std::cmp::Ordering::{Equal, Less, Greater};
179289
match r.binary_search_by(|&(lo, hi, _)| {
180290
if lo <= c && c <= hi { Equal }
181291
else if hi < c { Less }
@@ -217,9 +327,16 @@ def emit_break_module(f, break_table, break_cats, name):
217327
/// The version of [Unicode](http://www.unicode.org/)
218328
/// that this version of unicode-segmentation is based on.
219329
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
220-
221330
""" % unicode_version)
222331

332+
gencats = load_gencats("UnicodeData.txt")
333+
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
334+
335+
emit_util_mod(rf)
336+
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
337+
("derived_property", derived, ["Alphabetic"]):
338+
emit_property_module(rf, name, cat, pfuns)
339+
223340
### grapheme cluster module
224341
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
225342
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])

scripts/unicode_gen_breaktests.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# except according to those terms.
1313

1414
# This script uses the following Unicode tables:
15-
# - GraphemeBreakTest.txt
16-
# - WordBreakTest.txt
15+
# - auxiliary/GraphemeBreakTest.txt
16+
# - auxiliary/WordBreakTest.txt
1717
#
1818
# Since this should not require frequent updates, we just store this
1919
# out-of-line and check the unicode.rs file into git.
@@ -139,7 +139,7 @@ def showfun(x):
139139
outstr += '])'
140140
return outstr
141141

142-
def create_grapheme_data():
142+
def create_grapheme_data(f):
143143
# rules 9.1 and 9.2 are for extended graphemes only
144144
optsplits = ['9.1','9.2']
145145
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
@@ -169,15 +169,14 @@ def create_grapheme_data():
169169
else:
170170
test_diff.append((allchars, extgraphs, c))
171171

172-
stype = "&[(&str, &[&str])]"
173-
dtype = "&[(&str, &[&str], &[&str])]"
174-
with open("graph_tests.rs", "w") as rf:
175-
rf.write(" // official Unicode test data\n")
176-
rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
177-
unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
178-
unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)
172+
stype = "&'static [(&'static str, &'static [&'static str])]"
173+
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174+
f.write(" // official Unicode test data\n")
175+
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
176+
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
177+
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
179178

180-
def create_words_data():
179+
def create_words_data(f):
181180
d = load_test_data("auxiliary/WordBreakTest.txt")
182181

183182
test = []
@@ -186,12 +185,13 @@ def create_words_data():
186185
allchars = [cn for s in c for cn in s]
187186
test.append((allchars, c))
188187

189-
wtype = "&[(&str, &[&str])]"
190-
with open("word_tests.rs", "w") as rf:
191-
rf.write(" // official Unicode test data\n")
192-
rf.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
193-
unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)
188+
wtype = "&'static [(&'static str, &'static [&'static str])]"
189+
f.write(" // official Unicode test data\n")
190+
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
191+
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
194192

195193
if __name__ == "__main__":
196-
create_grapheme_data()
197-
create_words_data()
194+
with open("testdata.rs", "w") as rf:
195+
rf.write(unicode.preamble)
196+
create_grapheme_data(rf)
197+
create_words_data(rf)

src/grapheme.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,15 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
#[cfg(feature = "no_std")]
1112
use core::prelude::*;
1213

14+
#[cfg(feature = "no_std")]
1315
use core::cmp;
1416

17+
#[cfg(not(feature = "no_std"))]
18+
use std::cmp;
19+
1520
use tables::grapheme::GraphemeCat;
1621

1722
/// External iterator for grapheme clusters and byte offsets.
@@ -108,7 +113,7 @@ impl<'a> Iterator for Graphemes<'a> {
108113
Start if '\r' == ch => {
109114
let slen = self.string.len();
110115
let nidx = idx + 1;
111-
if nidx != slen && self.string.char_at(nidx) == '\n' {
116+
if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' {
112117
idx = nidx; // rule GB3
113118
}
114119
break; // rule GB4
@@ -160,7 +165,7 @@ impl<'a> Iterator for Graphemes<'a> {
160165
}
161166

162167
self.cat = if take_curr {
163-
idx = idx + self.string.char_at(idx).len_utf8();
168+
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
164169
None
165170
} else {
166171
Some(cat)
@@ -209,7 +214,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
209214
// HangulLVT means the letter to the right is T
210215
state = match state {
211216
Start if '\n' == ch => {
212-
if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
217+
if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() {
213218
idx -= 1; // rule GB3
214219
}
215220
break; // rule GB4

src/lib.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,26 @@
3939
//! You can use this package in your project by adding the following
4040
//! to your `Cargo.toml`:
4141
//!
42+
//! # features
43+
//!
44+
//! unicode-segmentation supports a `no_std` feature. This eliminates dependence on std,
45+
//! and instead uses equivalent functions from core.
46+
//!
4247
//! ```toml
4348
//! [dependencies]
44-
//! unicode-segmentation = "0.0.1"
49+
//! unicode-segmentation = "0.1.0"
4550
//! ```
4651
4752
#![deny(missing_docs, unsafe_code)]
48-
#![feature(no_std, core, unicode)]
49-
#![no_std]
50-
#![cfg_attr(test, feature(str_char, collections))]
5153

54+
#![cfg_attr(feature = "no_std", no_std)]
55+
#![cfg_attr(feature = "no_std", feature(no_std, core))]
56+
57+
#[cfg(feature = "no_std")]
58+
#[macro_use]
5259
extern crate core;
53-
extern crate unicode;
5460

55-
#[cfg(test)]
61+
#[cfg(all(test, feature = "no_std"))]
5662
#[macro_use]
5763
extern crate std;
5864

@@ -66,6 +72,8 @@ mod word;
6672

6773
#[cfg(test)]
6874
mod test;
75+
#[cfg(test)]
76+
mod testdata;
6977

7078
/// Methods for segmenting strings according to
7179
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).

0 commit comments

Comments
 (0)