Skip to content

Add mixed-script detection #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
exclude = [ "target/*", "Cargo.lock" ]

[dependencies]
unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
compiler_builtins = { version = "0.1", optional = true }
Expand Down
20 changes: 20 additions & 0 deletions src/general_security_profile.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
//! for identifiers

use crate::tables::identifier_status as is;

/// Extension trait exposing the identifier-status check of the
/// [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile).
pub trait GeneralSecurityProfile {
    /// Reports whether the value is not restricted from use in identifiers.
    fn identifier_allowed(self) -> bool;
}

impl GeneralSecurityProfile for char {
    /// Delegates to the generated `identifier_status_allowed` table lookup.
    #[inline]
    fn identifier_allowed(self) -> bool {
        is::identifier_status_allowed(self)
    }
}

impl GeneralSecurityProfile for &'_ str {
    /// A string is allowed when none of its characters is rejected by
    /// `identifier_status_allowed`; the empty string is therefore allowed.
    #[inline]
    fn identifier_allowed(self) -> bool {
        let has_restricted_char = self.chars().any(|ch| !is::identifier_status_allowed(ch));
        !has_restricted_char
    }
}
23 changes: 9 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
//! ```rust
//! extern crate unicode_security;
//!
//! use unicode_security::IdentifierStatusChar;
//! use unicode_security::GeneralSecurityProfile;
//!
//! fn main() {
//! let ch = 'µ'; // U+00B5 MICRO SIGN
Expand Down Expand Up @@ -55,21 +55,16 @@ extern crate std;
#[cfg(feature = "bench")]
extern crate test;

use tables::identifier_status as is;
pub use tables::UNICODE_VERSION;

mod tables;
pub mod mixed_script;
pub mod general_security_profile;

#[cfg(test)]
mod tests;
pub use mixed_script::MixedScript;
pub use general_security_profile::GeneralSecurityProfile;

/// Extension trait exposing the identifier-status check for a value.
pub trait UnicodeIdentifierStatus {
    /// Reports whether the value is not restricted from use in identifiers.
    fn identifier_allowed(self) -> bool;
}
#[rustfmt::skip]
pub(crate) mod tables;

impl UnicodeIdentifierStatus for char {
    /// Delegates to the generated `identifier_status_allowed` table lookup.
    #[inline]
    fn identifier_allowed(self) -> bool {
        is::identifier_status_allowed(self)
    }
}
#[cfg(test)]
mod tests;
129 changes: 129 additions & 0 deletions src/mixed_script.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)

use unicode_script::{Script, ScriptExtension};

/// An Augmented script set, as defined by UTS 39
///
/// https://www.unicode.org/reports/tr39/#def-augmented-script-set
pub struct AugmentedScriptSet {
/// The base ScriptExtension value
pub base: ScriptExtension,
/// Han With Bopomofo
pub hanb: bool,
/// Japanese
pub jpan: bool,
/// Korean
pub kore: bool,
}

impl From<ScriptExtension> for AugmentedScriptSet {
    /// Augments a `ScriptExtension` with the Hanb / Jpan / Kore flags.
    ///
    /// Common, Inherited and anything containing Han switch on all three
    /// flags; otherwise each flag is driven by its own constituent scripts.
    fn from(ext: ScriptExtension) -> Self {
        // These three cases belong to every augmented writing system.
        let in_every_system = ext == ScriptExtension::Single(Script::Common)
            || ext == ScriptExtension::Single(Script::Inherited)
            || ext.contains_script(Script::Han);

        let hanb = in_every_system || ext.contains_script(Script::Bopomofo);
        let jpan = in_every_system
            || ext.contains_script(Script::Hiragana)
            || ext.contains_script(Script::Katakana);
        let kore = in_every_system || ext.contains_script(Script::Hangul);

        Self {
            base: ext,
            hanb,
            jpan,
            kore,
        }
    }
}

impl From<char> for AugmentedScriptSet {
fn from(c: char) -> Self {
AugmentedScriptSet::for_char(c)
}
}

impl From<&'_ str> for AugmentedScriptSet {
fn from(s: &'_ str) -> Self {
AugmentedScriptSet::for_str(s)
}
}

impl Default for AugmentedScriptSet {
    /// The default set has a `Common` base and every augmented flag switched
    /// on; `for_str` uses it as the starting value of its intersection.
    fn default() -> Self {
        let base = ScriptExtension::Single(Script::Common);
        Self {
            base,
            hanb: true,
            jpan: true,
            kore: true,
        }
    }
}

impl AugmentedScriptSet {
    /// Intersect this set with another, field by field.
    pub fn intersect(self, other: Self) -> Self {
        Self {
            base: self.base.intersect(other.base),
            hanb: self.hanb && other.hanb,
            jpan: self.jpan && other.jpan,
            kore: self.kore && other.kore,
        }
    }

    /// Check if the set is empty: the base is empty and no augmented flag is set.
    pub fn is_empty(&self) -> bool {
        let any_augmented = self.hanb || self.jpan || self.kore;
        self.base.is_empty() && !any_augmented
    }

    /// Check if the set is "All" (its base is exactly Common or exactly Inherited).
    pub fn is_all(&self) -> bool {
        let common = ScriptExtension::Single(Script::Common);
        let inherited = ScriptExtension::Single(Script::Inherited);
        self.base == common || self.base == inherited
    }

    /// Construct an AugmentedScriptSet for a given character.
    pub fn for_char(c: char) -> Self {
        Self::from(ScriptExtension::from(c))
    }

    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string.
    ///
    /// Starts from the default set and intersects it with the set of every
    /// character in turn; an empty string yields the default set unchanged.
    pub fn for_str(s: &str) -> Self {
        s.chars()
            .fold(Self::default(), |acc, ch| acc.intersect(Self::for_char(ch)))
    }
}

/// Extension trait providing [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection).
pub trait MixedScript {
    /// Reports whether the value is [single-script](https://www.unicode.org/reports/tr39/#def-single-script).
    ///
    /// A single-script string may still contain characters with different Script properties.
    fn is_single_script(self) -> bool;

    /// Computes the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of the value.
    fn resolve_script_set(self) -> AugmentedScriptSet;
}

impl MixedScript for &'_ str {
    /// A string is single-script when its resolved script set is non-empty.
    fn is_single_script(self) -> bool {
        let resolved = AugmentedScriptSet::for_str(self);
        !resolved.is_empty()
    }

    /// Resolves the string through the `From<&str>` conversion.
    fn resolve_script_set(self) -> AugmentedScriptSet {
        AugmentedScriptSet::from(self)
    }
}
14 changes: 7 additions & 7 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,19 @@

// Checks `identifier_allowed` on a handful of characters, calling it both
// through the fully qualified trait path and with method syntax.
#[test]
fn test_char() {
use super::IdentifierStatusChar;
assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true);
use crate::GeneralSecurityProfile;
assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true);
assert_eq!('A'.identifier_allowed(), true);
assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true);
assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true);
assert_eq!('0'.identifier_allowed(), true);
assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true);
assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true);
assert_eq!('_'.identifier_allowed(), true);
assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false);
assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false);
assert_eq!('\x00'.identifier_allowed(), false);
// U+00B5 MICRO SIGN
assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false);
assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false);
assert_eq!('µ'.identifier_allowed(), false);
// U+2160 ROMAN NUMERAL ONE
assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false);
assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false);
assert_eq!('Ⅰ'.identifier_allowed(), false);
}