From 8c13eeaf217bac6117f598dbd16c946fdd4bc547 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 13 Apr 2020 22:43:57 -0500 Subject: [PATCH 1/4] Specify individual unic dependencies --- Cargo.lock | 182 ++--------------------------------- parser/Cargo.toml | 2 +- vm/Cargo.toml | 19 ++-- vm/src/obj/objstr.rs | 4 +- vm/src/stdlib/unicodedata.rs | 16 ++- 5 files changed, 31 insertions(+), 192 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20fb3a0640..6f9efb835a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1614,8 +1614,12 @@ dependencies = [ "statrs", "subprocess", "uname", - "unic", - "unic-common", + "unic-bidi", + "unic-char-property", + "unic-normal", + "unic-ucd-age", + "unic-ucd-category", + "unic-ucd-ident", "unicode-casing", "unicode_names2", "volatile", @@ -2002,22 +2006,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unic" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e31748f3e294dc6a9243a44686e8155a162af9a11cd56e07c0ebbc530b2a8a87" -dependencies = [ - "unic-bidi", - "unic-char", - "unic-common", - "unic-emoji", - "unic-idna", - "unic-normal", - "unic-segment", - "unic-ucd", -] - [[package]] name = "unic-bidi" version = "0.9.0" @@ -2028,23 +2016,6 @@ dependencies = [ "unic-ucd-bidi", ] -[[package]] -name = "unic-char" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25df79bd134107f088ba725d9c470600f16263205d0be36c75e75b020bac0a" -dependencies = [ - "unic-char-basics", - "unic-char-property", - "unic-char-range", -] - -[[package]] -name = "unic-char-basics" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20e5d239bc6394309225a0c1b13e1d059565ff2cfef1a437aff4a5871fa06c4b" - [[package]] name = "unic-char-property" version = "0.9.0" @@ -2066,15 +2037,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" -[[package]] -name = "unic-emoji" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74193f32f7966ad20b819e70e29c6f1ac8c386692a9d5e90078eef80ea008bfb" -dependencies = [ - "unic-emoji-char", -] - [[package]] name = "unic-emoji-char" version = "0.9.0" @@ -2086,38 +2048,6 @@ dependencies = [ "unic-ucd-version", ] -[[package]] -name = "unic-idna" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621e9cf526f2094d2c2ced579766458a92f8f422d6bb934c503ba1a95823a62d" -dependencies = [ - "matches", - "unic-idna-mapping", - "unic-idna-punycode", - "unic-normal", - "unic-ucd-bidi", - "unic-ucd-normal", - "unic-ucd-version", -] - -[[package]] -name = "unic-idna-mapping" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4de70fd4e5331537347a50a0dbc938efb1f127c9f6e5efec980fc90585aa1343" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-idna-punycode" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06feaedcbf9f1fc259144d833c0d630b8b15207b0486ab817d29258bc89f2f8a" - [[package]] name = "unic-normal" version = "0.9.0" @@ -2127,36 +2057,6 @@ dependencies = [ "unic-ucd-normal", ] -[[package]] -name = "unic-segment" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4ed5d26be57f84f176157270c112ef57b86debac9cd21daaabbe56db0f88f23" -dependencies = [ - "unic-ucd-segment", -] - -[[package]] -name = "unic-ucd" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "625b18f7601e1127504a20ae731dc3c7826d0e86d5f7fe3434f8137669240efd" -dependencies = [ - "unic-ucd-age", - "unic-ucd-bidi", - "unic-ucd-block", - "unic-ucd-case", - "unic-ucd-category", - "unic-ucd-common", - "unic-ucd-hangul", - "unic-ucd-ident", - "unic-ucd-name", - "unic-ucd-name_aliases", - "unic-ucd-normal", - "unic-ucd-segment", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-age" version = "0.9.0" @@ -2179,28 +2079,6 @@ dependencies = [ "unic-ucd-version", ] -[[package]] -name = "unic-ucd-block" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b2a16f2d7ecd25325a1053ca5a66e7fa1b68911a65c5e97f8d2e1b236b6f1d7" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-case" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d98d6246a79bac6cf66beee01422bda7c882e11d837fa4969bfaaba5fdea6d3" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-category" version = "0.9.0" @@ -2213,17 +2091,6 @@ dependencies = [ "unic-ucd-version", ] -[[package]] -name = "unic-ucd-common" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9b78b910beafa1aae5c59bf00877c6cece1c5db28a1241ad801e86cecdff4ad" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-hangul" version = "0.9.0" @@ -2244,27 +2111,6 @@ dependencies = [ "unic-ucd-version", ] -[[package]] -name = "unic-ucd-name" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8fc55a45b2531089dc1773bf60c1f104b38e434b774ffc37b9c29a9b0f492e" -dependencies = [ - "unic-char-property", - "unic-ucd-hangul", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-name_aliases" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b7674212643087699ba247a63dd05f1204c7e4880ec9342e545a7cffcc6a46f" -dependencies = [ - "unic-char-property", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-normal" version = "0.9.0" @@ -2273,22 +2119,10 @@ checksum = "86aed873b8202d22b13859dda5fe7c001d271412c31d411fd9b827e030569410" dependencies = [ "unic-char-property", "unic-char-range", - "unic-ucd-category", "unic-ucd-hangul", "unic-ucd-version", ] -[[package]] -name = "unic-ucd-segment" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2079c122a62205b421f499da10f3ee0f7697f012f55b675e002483c73ea34700" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-version" version = "0.9.0" @@ -2339,9 +2173,9 @@ checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" [[package]] name = "unicode_names2" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a928b876ff873d4a0ac966acce72423879dd86afcf190017aa700207188078" +checksum = "87d6678d7916394abad0d4b19df4d3802e1fd84abd7d701f39b75ee71b9e8cf1" [[package]] name = "utf8parse" diff --git a/parser/Cargo.toml b/parser/Cargo.toml index d34a5c9c04..fec83547b7 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -17,4 +17,4 @@ log="0.4.1" num-bigint = "0.2" num-traits = "0.2" unic-emoji-char = "0.9" -unic-ucd-ident = "0.9" +unic-ucd-ident = "0.9" diff --git a/vm/Cargo.toml b/vm/Cargo.toml index 840302fa98..6e1736b52c 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -53,12 +53,6 @@ hex = "0.4.0" hexf-parse = "0.1.0" indexmap = "1.0.2" crc = "^1.0.0" -unicode_names2 = "0.3" -# TODO: use unic for this; needed for title case: -# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 -unicode-casing = "0.1" -unic = "0.9" -unic-common = "0.9" maplit = "1.0" bitflags = "1.2.1" libc = "0.2" @@ -75,6 +69,19 @@ smallbox = "0.8" bstr = "0.2.12" crossbeam-utils = "0.7" +## unicode stuff +unicode_names2 = "0.4" +# TODO: use unic for this; needed for title case: +# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 +unicode-casing = "0.1" +# update version all at the same time +unic-bidi = "0.9" +unic-char-property = "0.9" +unic-normal = "0.9" +unic-ucd-category = "0.9" +unic-ucd-age = "0.9" +unic-ucd-ident = "0.9" + flame = { version = "0.2", optional = true } flamer = { version = "0.3", optional = true } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index 4110cfb6d4..edaa777f16 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -7,8 +7,8 @@ use std::string::ToString; use crossbeam_utils::atomic::AtomicCell; use num_traits::ToPrimitive; -use unic::ucd::category::GeneralCategory; -use unic::ucd::ident::{is_xid_continue, is_xid_start}; +use unic_ucd_category::GeneralCategory; +use unic_ucd_ident::{is_xid_continue, is_xid_start}; use unicode_casing::CharExt; use super::objbytes::{PyBytes, PyBytesRef}; diff --git a/vm/src/stdlib/unicodedata.rs b/vm/src/stdlib/unicodedata.rs index 713de4fe18..03c23064dc 100644 --- a/vm/src/stdlib/unicodedata.rs +++ b/vm/src/stdlib/unicodedata.rs @@ -9,12 +9,11 @@ use crate::pyobject::{PyClassImpl, PyObject, PyObjectRef, PyResult, PyValue}; use crate::vm::VirtualMachine; use itertools::Itertools; -use unic::bidi::BidiClass; -use unic::char::property::EnumeratedCharProperty; -use unic::normal::StrNormalForm; -use unic::ucd::category::GeneralCategory; -use unic::ucd::{Age, Name}; -use unic_common::version::UnicodeVersion; +use unic_bidi::BidiClass; +use unic_char_property::EnumeratedCharProperty; +use unic_normal::StrNormalForm; +use unic_ucd_age::{Age, UnicodeVersion, UNICODE_VERSION}; +use unic_ucd_category::GeneralCategory; pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { let ctx = &vm.ctx; @@ -71,7 +70,7 @@ impl Default for PyUCD { #[inline(always)] fn default() -> Self { PyUCD { - unic_version: unic::UNICODE_VERSION, + unic_version: UNICODE_VERSION, } } } @@ -105,7 +104,6 @@ impl PyUCD { #[pymethod] fn lookup(&self, name: PyStringRef, vm: &VirtualMachine) -> PyResult { - // TODO: we might want to use unic_ucd instead of unicode_names2 for this too, if possible: if let Some(character) = unicode_names2::character(name.as_str()) { if self.check_age(character) { return Ok(character.to_string()); @@ -125,7 +123,7 @@ impl PyUCD { if let Some(c) = c { if self.check_age(c) { - if let Some(name) = Name::of(c) { + if let Some(name) = unicode_names2::name(c) { return Ok(vm.new_str(name.to_string())); } } From c39ac3393ace7742f31a3d1a7365ca9e39de7314 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 13 Apr 2020 23:28:43 -0500 Subject: [PATCH 2/4] Add unicode name string escapes --- Cargo.lock | 1 + parser/Cargo.toml | 1 + parser/src/lexer.rs | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f9efb835a..1c279a2aaf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1539,6 +1539,7 @@ dependencies = [ "num-traits", "unic-emoji-char", "unic-ucd-ident", + "unicode_names2", ] [[package]] diff --git a/parser/Cargo.toml b/parser/Cargo.toml index fec83547b7..0ec3b0e4e1 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -18,3 +18,4 @@ num-bigint = "0.2" num-traits = "0.2" unic-emoji-char = "0.9" unic-ucd-ident = "0.9" +unicode_names2 = "0.4" diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index f239e6e195..53f3bf5cc4 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -475,6 +475,37 @@ where u8::from_str_radix(&octet_content, 8).unwrap() as char } + fn parse_unicode_name(&mut self) -> Result { + let start_pos = self.get_pos(); + match self.next_char() { + Some('{') => {} + _ => { + return Err(LexicalError { + error: LexicalErrorType::StringError, + location: start_pos, + }) + } + } + let start_pos = self.get_pos(); + let mut name = String::new(); + loop { + match self.next_char() { + Some('}') => break, + Some(c) => name.push(c), + None => { + return Err(LexicalError { + error: LexicalErrorType::StringError, + location: self.get_pos(), + }) + } + } + } + unicode_names2::character(&name).ok_or(LexicalError { + error: LexicalErrorType::UnicodeError, + location: start_pos, + }) + } + fn lex_string( &mut self, is_bytes: bool, @@ -532,11 +563,14 @@ where Some('t') => { string_content.push('\t'); } - Some('u') => string_content.push(self.unicode_literal(4)?), - Some('U') => string_content.push(self.unicode_literal(8)?), - Some('x') => string_content.push(self.unicode_literal(2)?), Some('v') => string_content.push('\x0b'), Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)), + Some('x') => string_content.push(self.unicode_literal(2)?), + Some('u') if !is_bytes => string_content.push(self.unicode_literal(4)?), + Some('U') if !is_bytes => string_content.push(self.unicode_literal(8)?), + Some('N') if !is_bytes => { + string_content.push(self.parse_unicode_name()?) + } Some(c) => { string_content.push('\\'); string_content.push(c); From d92cebd9533b520985761907ef438fe83d5741d0 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Tue, 14 Apr 2020 13:06:32 -0500 Subject: [PATCH 3/4] Unskip tests that depend on \N --- Lib/test/test_json/test_unicode.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py index 1e4fc56ebb..bcad9d96ee 100644 --- a/Lib/test/test_json/test_unicode.py +++ b/Lib/test/test_json/test_unicode.py @@ -8,25 +8,21 @@ class TestUnicode: # test_encoding1 and test_encoding2 from 2.x are irrelevant (only str # is supported as input, not bytes). - @unittest.skip("TODO: RUSTPYTHON") def test_encoding3(self): u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = self.dumps(u) self.assertEqual(j, '"\\u03b1\\u03a9"') - @unittest.skip("TODO: RUSTPYTHON") def test_encoding4(self): u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = self.dumps([u]) self.assertEqual(j, '["\\u03b1\\u03a9"]') - @unittest.skip("TODO: RUSTPYTHON") def test_encoding5(self): u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = self.dumps(u, ensure_ascii=False) self.assertEqual(j, '"{0}"'.format(u)) - @unittest.skip("TODO: RUSTPYTHON") def test_encoding6(self): u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = self.dumps([u], ensure_ascii=False) From d7f28949e2da4e81c829adf2afef776603ff6a0b Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 15 Apr 2020 19:24:00 -0500 Subject: [PATCH 4/4] Add test for unicode escape --- parser/src/lexer.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index 53f3bf5cc4..8667165ea4 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -1721,4 +1721,20 @@ mod tests { ] ) } + + #[test] + fn test_escape_unicode_name() { + let source = r#""\N{EN SPACE}""#; + let tokens = lex_source(source); + assert_eq!( + tokens, + vec![ + Tok::String { + value: "\u{2002}".to_owned(), + is_fstring: false, + }, + Tok::Newline + ] + ) + } }