diff --git a/Cargo.toml b/Cargo.toml
index 6876866..5817881 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
 exclude = [ "target/*", "Cargo.lock" ]
 
 [dependencies]
+unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
 std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
 core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
 compiler_builtins = { version = "0.1", optional = true }
diff --git a/src/general_security_profile.rs b/src/general_security_profile.rs
new file mode 100644
index 0000000..7db242a
--- /dev/null
+++ b/src/general_security_profile.rs
@@ -0,0 +1,20 @@
+//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
+//! for identifiers
+
+use crate::tables::identifier_status as is;
+
+/// Methods for determining characters not restricted from use for identifiers.
+pub trait GeneralSecurityProfile {
+    /// Returns whether the character is not restricted from use for identifiers.
+    fn identifier_allowed(self) -> bool;
+}
+
+impl GeneralSecurityProfile for char {
+    #[inline]
+    fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
+}
+
+impl GeneralSecurityProfile for &'_ str {
+    #[inline]
+    fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 75cf4bc..8a2b5b4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,7 +15,7 @@
 //! ```rust
 //! extern crate unicode_security;
 //!
-//! use unicode_security::IdentifierStatusChar;
+//! use unicode_security::GeneralSecurityProfile;
 //!
 //! fn main() {
 //!     let ch = 'µ'; // U+00B5 MICRO SIGN
@@ -55,21 +55,16 @@ extern crate std;
 #[cfg(feature = "bench")]
 extern crate test;
 
-use tables::identifier_status as is;
 pub use tables::UNICODE_VERSION;
 
-mod tables;
+pub mod mixed_script;
+pub mod general_security_profile;
 
-#[cfg(test)]
-mod tests;
+pub use mixed_script::MixedScript;
+pub use general_security_profile::GeneralSecurityProfile;
 
-/// Methods for determining characters not restricted from use for identifiers.
-pub trait UnicodeIdentifierStatus {
-    /// Returns whether the character is not restricted from use for identifiers.
-    fn identifier_allowed(self) -> bool;
-}
+#[rustfmt::skip]
+pub(crate) mod tables;
 
-impl UnicodeIdentifierStatus for char {
-    #[inline]
-    fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
-}
+#[cfg(test)]
+mod tests;
diff --git a/src/mixed_script.rs b/src/mixed_script.rs
new file mode 100644
index 0000000..0cc12bf
--- /dev/null
+++ b/src/mixed_script.rs
@@ -0,0 +1,138 @@
+//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
+
+use unicode_script::{Script, ScriptExtension};
+
+/// An Augmented script set, as defined by UTS 39
+///
+/// <https://www.unicode.org/reports/tr39/#def-augmented-script-set>
+pub struct AugmentedScriptSet {
+    /// The base ScriptExtension value
+    pub base: ScriptExtension,
+    /// Han With Bopomofo
+    pub hanb: bool,
+    /// Japanese
+    pub jpan: bool,
+    /// Korean
+    pub kore: bool,
+}
+
+impl From<ScriptExtension> for AugmentedScriptSet {
+    /// Augment a script extension with the Hanb/Jpan/Kore flags it implies.
+    fn from(ext: ScriptExtension) -> Self {
+        let mut hanb = false;
+        let mut jpan = false;
+        let mut kore = false;
+
+        // Common, Inherited and Han each imply all three augmented scripts.
+        if ext == ScriptExtension::Single(Script::Common) ||
+            ext == ScriptExtension::Single(Script::Inherited) ||
+            ext.contains_script(Script::Han) {
+            hanb = true;
+            jpan = true;
+            kore = true;
+        } else {
+            // Otherwise a flag is set only by one of its own component scripts.
+            if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
+                jpan = true;
+            }
+
+            if ext.contains_script(Script::Hangul) {
+                kore = true;
+            }
+
+            if ext.contains_script(Script::Bopomofo) {
+                hanb = true;
+            }
+        }
+        Self {
+            base: ext,
+            hanb, jpan, kore
+        }
+    }
+}
+
+impl From<char> for AugmentedScriptSet {
+    fn from(c: char) -> Self {
+        AugmentedScriptSet::for_char(c)
+    }
+}
+
+impl From<&'_ str> for AugmentedScriptSet {
+    fn from(s: &'_ str) -> Self {
+        AugmentedScriptSet::for_str(s)
+    }
+}
+
+impl Default for AugmentedScriptSet {
+    /// The "All" set: a Common base with every augmented flag set.
+    ///
+    /// `for_str` starts from this value before intersecting each character in.
+    fn default() -> Self {
+        AugmentedScriptSet {
+            base: ScriptExtension::Single(Script::Common),
+            hanb: true,
+            jpan: true,
+            kore: true,
+        }
+    }
+}
+
+impl AugmentedScriptSet {
+    /// Intersect this set with another
+    pub fn intersect(mut self, other: Self) -> Self {
+        self.base = self.base.intersect(other.base);
+        self.hanb = self.hanb && other.hanb;
+        self.jpan = self.jpan && other.jpan;
+        self.kore = self.kore && other.kore;
+        self
+    }
+
+    /// Check if the set is empty
+    pub fn is_empty(&self) -> bool {
+        self.base.is_empty() && !self.hanb && !self.jpan && !self.kore
+    }
+
+    /// Check if the set is "All" (Common or Inherited)
+    pub fn is_all(&self) -> bool {
+        self.base == ScriptExtension::Single(Script::Common) ||
+            self.base == ScriptExtension::Single(Script::Inherited)
+    }
+
+    /// Construct an AugmentedScriptSet for a given character
+    pub fn for_char(c: char) -> Self {
+        ScriptExtension::from(c).into()
+    }
+
+    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
+    ///
+    /// An empty string yields the default ("All") set.
+    pub fn for_str(s: &str) -> Self {
+        let mut set = AugmentedScriptSet::default();
+        for ch in s.chars() {
+            set = set.intersect(ch.into())
+        }
+        set
+    }
+}
+
+/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
+pub trait MixedScript {
+    /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
+    ///
+    /// Note that a single-script string may still contain multiple Script properties!
+    fn is_single_script(self) -> bool;
+
+    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
+    fn resolve_script_set(self) -> AugmentedScriptSet;
+}
+
+impl MixedScript for &'_ str {
+    fn is_single_script(self) -> bool {
+        // Single-script means the resolved script set is non-empty.
+        !AugmentedScriptSet::for_str(self).is_empty()
+    }
+
+    fn resolve_script_set(self) -> AugmentedScriptSet {
+        self.into()
+    }
+}
diff --git a/src/tests.rs b/src/tests.rs
index b1f074b..ed32eae 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -10,19 +10,19 @@
 
 #[test]
 fn test_char() {
-    use super::IdentifierStatusChar;
-    assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true);
+    use crate::GeneralSecurityProfile;
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true);
     assert_eq!('A'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true);
     assert_eq!('0'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true);
     assert_eq!('_'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false);
     assert_eq!('\x00'.identifier_allowed(), false);
     // U+00B5 MICRO SIGN
-    assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false);
     assert_eq!('µ'.identifier_allowed(), false);
     // U+2160 ROMAN NUMERAL ONE
-    assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false);
     assert_eq!('Ⅰ'.identifier_allowed(), false);
 }