Skip to content

Commit 40f9ba6

Browse files
committed
Move lookups into own module
1 parent 08996fa commit 40f9ba6

File tree

6 files changed

+96
-83
lines changed

6 files changed

+96
-83
lines changed

src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pub use stream_safe::StreamSafe;
6565
use std::str::Chars;
6666

6767
mod decompose;
68+
mod lookups;
6869
mod normalize;
6970
mod perfect_hash;
7071
mod recompose;
@@ -81,7 +82,7 @@ mod normalization_tests;
8182
pub mod char {
8283
pub use normalize::{decompose_canonical, decompose_compatible, compose};
8384

84-
pub use perfect_hash::{canonical_combining_class, is_combining_mark};
85+
pub use lookups::{canonical_combining_class, is_combining_mark};
8586
}
8687

8788

src/lookups.rs

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
//! Lookups of unicode properties using minimal perfect hashing.
12+
13+
use perfect_hash::mph_lookup;
14+
use tables::*;
15+
16+
/// Look up the canonical combining class for a codepoint.
17+
///
18+
/// The value returned is as defined in the Unicode Character Database.
///
/// The character is converted to its `u32` codepoint and looked up in
/// `CANONICAL_COMBINING_CLASS_KV` through `mph_lookup`. `0` is passed as the
/// lookup's `default` argument (presumably what is returned for codepoints
/// that have no table entry -- confirm against `mph_lookup`).
19+
pub fn canonical_combining_class(c: char) -> u8 {
// Table entries are decoded with `u8_lookup_fk` / `u8_lookup_fv`.
20+
mph_lookup(c.into(), CANONICAL_COMBINING_CLASS_SALT, CANONICAL_COMBINING_CLASS_KV,
21+
u8_lookup_fk, u8_lookup_fv, 0)
22+
}
23+
24+
/// Look up the character stored for the pair `(c1, c2)` in the composition
/// tables, if any.
///
/// When both characters are below U+10000 the pair is packed into a single
/// `u32` key (`c1` in the upper 16 bits, `c2` in the lower 16 bits) and looked
/// up in `COMPOSITION_TABLE_KV` through `mph_lookup`, with `None` passed as
/// the `default` argument. Every other pair is delegated to
/// `composition_table_astral`.
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
// Both codepoints must fit in 16 bits for the packed key built below.
25+
if c1 < '\u{10000}' && c2 < '\u{10000}' {
26+
mph_lookup((c1 as u32) << 16 | (c2 as u32),
27+
COMPOSITION_TABLE_SALT, COMPOSITION_TABLE_KV,
28+
pair_lookup_fk, pair_lookup_fv_opt, None)
29+
} else {
// At least one codepoint is U+10000 or above: use the separate astral lookup.
30+
composition_table_astral(c1, c2)
31+
}
32+
}
33+
34+
/// Look up the `&'static [char]` slice stored for `c` in
/// `CANONICAL_DECOMPOSED_KV`.
///
/// The character is converted to its `u32` codepoint and passed to
/// `mph_lookup`; a table value is wrapped in `Some` by `pair_lookup_fv_opt`,
/// and `None` is passed as the `default` argument.
///
/// NOTE(review): going by the name, the slice is presumably the full canonical
/// decomposition of `c` -- confirm against the table generator.
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
35+
mph_lookup(c.into(), CANONICAL_DECOMPOSED_SALT, CANONICAL_DECOMPOSED_KV,
36+
pair_lookup_fk, pair_lookup_fv_opt, None)
37+
}
38+
39+
/// Look up the `&'static [char]` slice stored for `c` in
/// `COMPATIBILITY_DECOMPOSED_KV`.
///
/// The character is converted to its `u32` codepoint and passed to
/// `mph_lookup`; a table value is wrapped in `Some` by `pair_lookup_fv_opt`,
/// and `None` is passed as the `default` argument.
///
/// NOTE(review): going by the name, the slice is presumably the full
/// compatibility decomposition of `c` -- confirm against the table generator.
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
40+
mph_lookup(c.into(), COMPATIBILITY_DECOMPOSED_SALT, COMPATIBILITY_DECOMPOSED_KV,
41+
pair_lookup_fk, pair_lookup_fv_opt, None)
42+
}
43+
44+
/// Return whether the given character is a combining mark (`General_Category=Mark`)
///
/// The character is converted to its `u32` codepoint and looked up in
/// `COMBINING_MARK_KV` through `mph_lookup`. The value extractor
/// (`bool_lookup_fv`) yields `true` for any entry, and `false` is passed as
/// the lookup's `default` argument.
45+
pub fn is_combining_mark(c: char) -> bool {
46+
mph_lookup(c.into(), COMBINING_MARK_SALT, COMBINING_MARK_KV,
47+
bool_lookup_fk, bool_lookup_fv, false)
48+
}
49+
50+
/// Look up the `u8` value stored for `c` in `TRAILING_NONSTARTERS_KV` and
/// return it widened to `usize`.
///
/// The character is converted to its `u32` codepoint and passed to
/// `mph_lookup`, with `0` as the `default` argument.
///
/// NOTE(review): going by the name, the value is presumably the number of
/// trailing non-starters used by the stream-safe logic -- confirm against the
/// table generator.
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
51+
mph_lookup(c.into(), TRAILING_NONSTARTERS_SALT, TRAILING_NONSTARTERS_KV,
52+
u8_lookup_fk, u8_lookup_fv, 0) as usize
53+
}
54+
55+
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
///
/// The key occupies the upper 24 bits, so it is recovered by shifting the
/// 8 value bits out.
56+
#[inline]
57+
fn u8_lookup_fk(kv: u32) -> u32 {
58+
kv >> 8
59+
}
60+
61+
/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
///
/// The value occupies the low 8 bits; masking with `0xff` before the cast
/// makes the narrowing to `u8` explicit.
62+
#[inline]
63+
fn u8_lookup_fv(kv: u32) -> u8 {
64+
(kv & 0xff) as u8
65+
}
66+
67+
/// Extract the key for a boolean lookup.
///
/// The entry is returned unchanged: for this kind of lookup the whole `u32`
/// is the key.
68+
#[inline]
69+
fn bool_lookup_fk(kv: u32) -> u32 {
70+
kv
71+
}
72+
73+
/// Extract the value for a boolean lookup.
///
/// Always returns `true`, ignoring the entry itself: being handed an entry at
/// all is what signals membership.
74+
#[inline]
75+
fn bool_lookup_fv(_kv: u32) -> bool {
76+
true
77+
}
78+
79+
/// Extract the key in a pair.
///
/// The key is the first tuple element; the value of type `T` is discarded.
80+
#[inline]
81+
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
82+
kv.0
83+
}
84+
85+
/// Extract the value in a pair, returning an option.
///
/// Always returns `Some` of the second tuple element, so callers can pass
/// `None` as the lookup default.
86+
#[inline]
87+
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
88+
Some(kv.1)
89+
}

src/normalize.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use std::char;
1313
use std::ops::FnMut;
14-
use perfect_hash::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
14+
use lookups::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
1515

1616
/// Compute canonical Unicode decomposition for character.
1717
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)

src/perfect_hash.rs

Lines changed: 1 addition & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010

1111
//! Support for lookups based on minimal perfect hashing.
1212
13-
use tables::*;
14-
1513
// This function is based on multiplication being fast and is "good enough". Also
1614
// it can share some work between the unsalted and salted versions.
1715
#[inline]
@@ -29,7 +27,7 @@ fn my_hash(key: u32, salt: u32, n: usize) -> usize {
2927
/// The hash function doesn't have to be very good, just good enough that the
3028
/// resulting map is unique.
3129
#[inline]
32-
fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
30+
pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
3331
default: V) -> V
3432
where KV: Copy, FK: Fn(KV) -> u32, FV: Fn(KV) -> V
3533
{
@@ -41,78 +39,3 @@ fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
4139
default
4240
}
4341
}
44-
45-
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
46-
#[inline]
47-
fn u8_lookup_fk(kv: u32) -> u32 {
48-
kv >> 8
49-
}
50-
51-
/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
52-
#[inline]
53-
fn u8_lookup_fv(kv: u32) -> u8 {
54-
(kv & 0xff) as u8
55-
}
56-
57-
/// Extract the key for a boolean lookup.
58-
#[inline]
59-
fn bool_lookup_fk(kv: u32) -> u32 {
60-
kv
61-
}
62-
63-
/// Extract the value for a boolean lookup.
64-
#[inline]
65-
fn bool_lookup_fv(_kv: u32) -> bool {
66-
true
67-
}
68-
69-
/// Extract the key in a pair.
70-
#[inline]
71-
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
72-
kv.0
73-
}
74-
75-
/// Extract the value in a pair, returning an option.
76-
#[inline]
77-
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
78-
Some(kv.1)
79-
}
80-
81-
/// Look up the canonical combining class for a codepoint.
82-
///
83-
/// The value returned is as defined in the Unicode Character Database.
84-
pub fn canonical_combining_class(c: char) -> u8 {
85-
mph_lookup(c.into(), CANONICAL_COMBINING_CLASS_SALT, CANONICAL_COMBINING_CLASS_KV,
86-
u8_lookup_fk, u8_lookup_fv, 0)
87-
}
88-
89-
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
90-
if c1 < '\u{10000}' && c2 < '\u{10000}' {
91-
mph_lookup((c1 as u32) << 16 | (c2 as u32),
92-
COMPOSITION_TABLE_SALT, COMPOSITION_TABLE_KV,
93-
pair_lookup_fk, pair_lookup_fv_opt, None)
94-
} else {
95-
composition_table_astral(c1, c2)
96-
}
97-
}
98-
99-
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
100-
mph_lookup(c.into(), CANONICAL_DECOMPOSED_SALT, CANONICAL_DECOMPOSED_KV,
101-
pair_lookup_fk, pair_lookup_fv_opt, None)
102-
}
103-
104-
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
105-
mph_lookup(c.into(), COMPATIBILITY_DECOMPOSED_SALT, COMPATIBILITY_DECOMPOSED_KV,
106-
pair_lookup_fk, pair_lookup_fv_opt, None)
107-
}
108-
109-
/// Return whether the given character is a combining mark (`General_Category=Mark`)
110-
pub fn is_combining_mark(c: char) -> bool {
111-
mph_lookup(c.into(), COMBINING_MARK_SALT, COMBINING_MARK_KV,
112-
bool_lookup_fk, bool_lookup_fv, false)
113-
}
114-
115-
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
116-
mph_lookup(c.into(), TRAILING_NONSTARTERS_SALT, TRAILING_NONSTARTERS_KV,
117-
u8_lookup_fk, u8_lookup_fv, 0) as usize
118-
}

src/quick_check.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use UnicodeNormalization;
2-
use perfect_hash::canonical_combining_class;
2+
use lookups::canonical_combining_class;
33
use stream_safe;
44
use tables;
55

src/stream_safe.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use normalize::{
22
hangul_decomposition_length,
33
is_hangul_syllable,
44
};
5-
use perfect_hash::{
5+
use lookups::{
66
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
77
stream_safe_trailing_nonstarters,
88
};
@@ -113,7 +113,7 @@ mod tests {
113113
use std::char;
114114
use normalization_tests::NORMALIZATION_TESTS;
115115
use normalize::decompose_compatible;
116-
use perfect_hash::canonical_combining_class;
116+
use lookups::canonical_combining_class;
117117

118118
fn stream_safe(s: &str) -> String {
119119
StreamSafe::new(s.chars()).collect()

0 commit comments

Comments
 (0)