unicode-rs
diff --git a/‎.travis.yml
Lines changed: 5 additions & 2 deletions b/‎.travis.yml
Lines changed: 5 additions & 2 deletions
diff --git a/‎Cargo.toml
Lines changed: 5 additions & 1 deletion b/‎Cargo.toml
Lines changed: 5 additions & 1 deletion
diff --git a/‎README.md
Lines changed: 6 additions & 1 deletion b/‎README.md
Lines changed: 6 additions & 1 deletion
diff --git a/‎scripts/unicode.py
Lines changed: 120 additions & 3 deletions b/‎scripts/unicode.py
Lines changed: 120 additions & 3 deletions
diff --git a/‎scripts/unicode_gen_breaktests.py
Lines changed: 18 additions & 18 deletions b/‎scripts/unicode_gen_breaktests.py
Lines changed: 18 additions & 18 deletions
diff --git a/‎src/grapheme.rs
Lines changed: 8 additions & 3 deletions b/‎src/grapheme.rs
Lines changed: 8 additions & 3 deletions
diff --git a/‎src/lib.rs
Lines changed: 14 additions & 6 deletions b/‎src/lib.rs
Lines changed: 14 additions & 6 deletions
@@ -1,8 +1,11 @@
 language: rust
 sudo: false
 script:
-  - cargo build --verbose
-  - cargo test --verbose
+  - cargo build --verbose --features no_std
+  - cargo test --verbose --features no_std
+  - cargo clean
+  - cargo build --verbose --features default
+  - cargo test --verbose --features default
   - rustdoc --test README.md -L target/debug -L target/debug/deps
   - cargo doc
 after_success: |
 
@@ -1,7 +1,7 @@
 [package]
 
 name = "unicode-segmentation"
-version = "0.0.1"
+version = "0.1.0"
 authors = ["kwantam <kwantam@gmail.com>"]
 
 homepage = "https://github.com/unicode-rs/unicode-segmentation"
@@ -17,3 +17,7 @@ according to Unicode Standard Annex #29 rules.
 """
 
 exclude = [ "target/*", "Cargo.lock", "scripts/tmp" ]
+
+[features]
+default = []
+no_std = []
@@ -26,12 +26,17 @@ fn main() {
 }
 ```
 
+# features
+
+unicode-segmentation supports a `no_std` feature. This eliminates dependence on std,
+and instead uses equivalent functions from core.
+
 # crates.io
 
 You can use this package in your project by adding the following
 to your `Cargo.toml`:
 
 ```toml
 [dependencies]
-unicode-segmentation = "0.0.1"
+unicode-segmentation = "0.1.0"
 ```
@@ -11,9 +11,11 @@
 # except according to those terms.
 
 # This script uses the following Unicode tables:
+# - DerivedCoreProperties.txt
 # - auxiliary/GraphemeBreakProperty.txt
 # - auxiliary/WordBreakProperty.txt
 # - ReadMe.txt
+# - UnicodeData.txt
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the unicode.rs file into git.
@@ -30,7 +32,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
+// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
 
 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
 '''
@@ -64,6 +66,48 @@ def fetch(f):
         sys.stderr.write("cannot load %s" % f)
         exit(1)
 
+def load_gencats(f):
+    """Build a table of general categories from a UnicodeData.txt-style file.
+
+    `f` is the file name; it is passed to fetch() first (presumably to make
+    sure the file is available locally -- fetch is defined elsewhere in
+    this script) and then read line by line.
+
+    Returns a dict mapping each category name to the result of group_cat()
+    applied to the list of code points placed in that category.  Besides its
+    own general category, every code point is also placed in "Assigned" and
+    in whatever extra categories expanded_categories lists for it.
+    """
+    fetch(f)
+    gencats = {}
+
+    # udict maps code point -> list of the 15 ';'-separated fields.
+    # range_start is the first code point of a pending "<..., First>" range,
+    # or -1 when no range is open.
+    udict = {};
+    range_start = -1;
+    for line in fileinput.input(f):
+        data = line.split(';');
+        # only lines with exactly 15 fields are treated as records
+        if len(data) != 15:
+            continue
+        cp = int(data[0], 16);
+        if is_surrogate(cp):
+            continue
+        if range_start >= 0:
+            # This record follows a "First>" record: give every code point
+            # from range_start up to (not including) cp this record's
+            # fields.  cp itself is stored by the assignment at the bottom
+            # of the loop.
+            for i in xrange(range_start, cp):
+                udict[i] = data;
+            range_start = -1;
+        if data[1].endswith(", First>"):
+            # Start of a range: remember where it begins and wait for the
+            # next record before storing anything.
+            range_start = cp;
+            continue;
+        udict[cp] = data;
+
+    for code in udict:
+        # Unpack all 15 fields; only gencat (the third field) is used below.
+        [code_org, name, gencat, combine, bidi,
+         decomp, deci, digit, num, mirror,
+         old, iso, upcase, lowcase, titlecase ] = udict[code];
+
+        # place letter in categories as appropriate
+        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
+            if cat not in gencats:
+                gencats[cat] = []
+            gencats[cat].append(code)
+
+    # replace each per-category list with its grouped form
+    gencats = group_cats(gencats)
+    return gencats
+
+def group_cats(cats):
+    """Return a new dict with the same keys as `cats`, where each value
+    has been passed through group_cat()."""
+    return dict((name, group_cat(members)) for name, members in cats.items())
+
 def group_cat(cat):
     cat_out = []
     letters = sorted(set(cat))
@@ -133,6 +177,11 @@ def load_properties(f, interestingprops):
         if prop not in props:
             props[prop] = []
         props[prop].append((d_lo, d_hi))
+
+    # optimize if possible
+    for prop in props:
+        props[prop] = group_cat(ungroup_cat(props[prop]))
+
     return props
 
 def escape_char(c):
@@ -156,13 +205,71 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
     format_table_content(f, data, 8)
     f.write("\n    ];\n\n")
 
+def emit_util_mod(f):
+    """Write the Rust source of a `pub mod util` to the file object `f`.
+
+    The emitted module contains:
+      - bsearch_range_table: tests whether a char falls inside any
+        (lo, hi) pair of a range table using binary_search_by; the
+        Ordering names (and SliceExt) are imported from core when the
+        `no_std` cargo feature is enabled and from std otherwise.
+      - is_alphabetic / is_numeric (private): answer ASCII letters and
+        digits directly and defer to derived_property::Alphabetic and
+        general_category::N for chars above '\\x7f'.
+      - is_alphanumeric (public): is_alphabetic(c) || is_numeric(c).
+
+    The text is written verbatim; nothing is substituted into it.
+    """
+    f.write("""
+pub mod util {
+    #[inline]
+    pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
+        #[cfg(feature = "no_std")]
+        use core::cmp::Ordering::{Equal, Less, Greater};
+        #[cfg(feature = "no_std")]
+        use core::slice::SliceExt;
+        
+        #[cfg(not(feature = "no_std"))]
+        use std::cmp::Ordering::{Equal, Less, Greater};
+        r.binary_search_by(|&(lo,hi)| {
+            if lo <= c && c <= hi { Equal }
+            else if hi < c { Less }
+            else { Greater }
+        }).is_ok()
+    }
+
+    #[inline]
+    fn is_alphabetic(c: char) -> bool {
+        match c {
+            'a' ... 'z' | 'A' ... 'Z' => true,
+            c if c > '\x7f' => super::derived_property::Alphabetic(c),
+            _ => false,
+        }
+    }
+
+    #[inline]
+    fn is_numeric(c: char) -> bool {
+        match c {
+            '0' ... '9' => true,
+            c if c > '\x7f' => super::general_category::N(c),
+            _ => false,
+        }
+    }
+
+    #[inline]
+    pub fn is_alphanumeric(c: char) -> bool {
+        is_alphabetic(c) || is_numeric(c)
+    }
+}
+
+""")
+
+def emit_property_module(f, mod, tbl, emit):
+    """Write a private Rust module named `mod` to the file object `f`.
+
+    For every property name in `emit` (visited in sorted order) the module
+    gets a non-public `<name>_table` built from tbl[name] via emit_table(),
+    followed by an inlined `pub fn <name>(c: char) -> bool` that looks `c`
+    up in that table through super::util::bsearch_range_table.
+    """
+    f.write("mod %s {\n" % mod)
+    for prop in sorted(emit):
+        emit_table(f, "%s_table" % prop, tbl[prop], is_pub=False)
+        # the accessor function, written piece by piece in source order
+        accessor = (
+            "    #[inline]\n",
+            "    pub fn %s(c: char) -> bool {\n" % prop,
+            "        super::util::bsearch_range_table(c, %s_table)\n" % prop,
+            "    }\n\n",
+        )
+        for piece in accessor:
+            f.write(piece)
+    f.write("}\n\n")
+
 def emit_break_module(f, break_table, break_cats, name):
     Name = name.capitalize()
     f.write("""pub mod %s {
+    #[cfg(feature = "no_std")]
     use core::slice::SliceExt;
-    pub use self::%sCat::*;
+    #[cfg(feature = "no_std")]
     use core::result::Result::{Ok, Err};
 
+    pub use self::%sCat::*;
+
     #[allow(non_camel_case_types)]
     #[derive(Clone, Copy, PartialEq, Eq)]
     pub enum %sCat {
@@ -175,7 +282,10 @@ def emit_break_module(f, break_table, break_cats, name):
     f.write("""    }
 
     fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
+        #[cfg(feature = "no_std")]
         use core::cmp::Ordering::{Equal, Less, Greater};
+        #[cfg(not(feature = "no_std"))]
+        use std::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
             else if hi < c { Less }
@@ -217,9 +327,16 @@ def emit_break_module(f, break_table, break_cats, name):
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-segmentation is based on.
 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
-
 """ % unicode_version)
 
+        gencats = load_gencats("UnicodeData.txt")
+        derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
+
+        emit_util_mod(rf)
+        for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
+                                  ("derived_property", derived, ["Alphabetic"]):
+            emit_property_module(rf, name, cat, pfuns)
+
         ### grapheme cluster module
         # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
         grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
 
@@ -12,8 +12,8 @@
 # except according to those terms.
 
 # This script uses the following Unicode tables:
-# - GraphemeBreakTest.txt
-# - WordBreakTest.txt
+# - auxiliary/GraphemeBreakTest.txt
+# - auxiliary/WordBreakTest.txt
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the unicode.rs file into git.
@@ -139,7 +139,7 @@ def showfun(x):
     outstr += '])'
     return outstr
 
-def create_grapheme_data():
+def create_grapheme_data(f):
     # rules 9.1 and 9.2 are for extended graphemes only
     optsplits = ['9.1','9.2']
     d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
@@ -169,15 +169,14 @@ def create_grapheme_data():
         else:
             test_diff.append((allchars, extgraphs, c))
 
-    stype = "&[(&str, &[&str])]"
-    dtype = "&[(&str, &[&str], &[&str])]"
-    with open("graph_tests.rs", "w") as rf:
-        rf.write("    // official Unicode test data\n")
-        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
-        unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
-        unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)
+    stype = "&'static [(&'static str, &'static [&'static str])]"
+    dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
+    unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
+    unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
 
-def create_words_data():
+def create_words_data(f):
     d = load_test_data("auxiliary/WordBreakTest.txt")
 
     test = []
@@ -186,12 +185,13 @@ def create_words_data():
         allchars = [cn for s in c for cn in s]
         test.append((allchars, c))
 
-    wtype = "&[(&str, &[&str])]"
-    with open("word_tests.rs", "w") as rf:
-        rf.write("    // official Unicode test data\n")
-        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
-        unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)
+    wtype = "&'static [(&'static str, &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
+    unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
 
 if __name__ == "__main__":
-    create_grapheme_data()
-    create_words_data()
+    with open("testdata.rs", "w") as rf:
+        rf.write(unicode.preamble)
+        create_grapheme_data(rf)
+        create_words_data(rf)
@@ -8,10 +8,15 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+#[cfg(feature = "no_std")]
 use core::prelude::*;
 
+#[cfg(feature = "no_std")]
 use core::cmp;
 
+#[cfg(not(feature = "no_std"))]
+use std::cmp;
+
 use tables::grapheme::GraphemeCat;
 
 /// External iterator for grapheme clusters and byte offsets.
@@ -108,7 +113,7 @@ impl<'a> Iterator for Graphemes<'a> {
                 Start if '\r' == ch => {
                     let slen = self.string.len();
                     let nidx = idx + 1;
-                    if nidx != slen && self.string.char_at(nidx) == '\n' {
+                    if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' {
                         idx = nidx;             // rule GB3
                     }
                     break;                      // rule GB4
@@ -160,7 +165,7 @@ impl<'a> Iterator for Graphemes<'a> {
         }
 
         self.cat = if take_curr {
-            idx = idx + self.string.char_at(idx).len_utf8();
+            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
             None
         } else {
             Some(cat)
@@ -209,7 +214,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
             // HangulLVT means the letter to the right is T
             state = match state {
                 Start if '\n' == ch => {
-                    if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
+                    if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() {
                         idx -= 1;       // rule GB3
                     }
                     break;              // rule GB4
 
@@ -39,20 +39,26 @@
 //! You can use this package in your project by adding the following
 //! to your `Cargo.toml`:
 //!
 //! ```toml
 //! [dependencies]
-//! unicode-segmentation = "0.0.1"
+//! unicode-segmentation = "0.1.0"
 //! ```
+//!
+//! # features
+//!
+//! unicode-segmentation supports a `no_std` feature. This eliminates dependence on std,
+//! and instead uses equivalent functions from core.
 
 #![deny(missing_docs, unsafe_code)]
-#![feature(no_std, core, unicode)]
-#![no_std]
-#![cfg_attr(test, feature(str_char, collections))]
 
+#![cfg_attr(feature = "no_std", no_std)]
+#![cfg_attr(feature = "no_std", feature(no_std, core))]
+
+#[cfg(feature = "no_std")]
+#[macro_use]
 extern crate core;
-extern crate unicode;
 
-#[cfg(test)]
+#[cfg(all(test, feature = "no_std"))]
 #[macro_use]
 extern crate std;
 
@@ -66,6 +72,8 @@ mod word;
 
 #[cfg(test)]
 mod test;
+#[cfg(test)]
+mod testdata;
 
 /// Methods for segmenting strings according to
 /// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).