Skip to content

Commit 8276803

Browse files
authored
Merge pull request RustPython#1869 from RustPython/coolreader18/unicode-name-escape
Remove unnecessary unic dependencies and add `\N{}` unicode name escapes
2 parents 336698f + d7f2894 commit 8276803

File tree

7 files changed

+86
-199
lines changed

7 files changed

+86
-199
lines changed

Cargo.lock

Lines changed: 9 additions & 174 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_json/test_unicode.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,25 +8,21 @@ class TestUnicode:
88
# test_encoding1 and test_encoding2 from 2.x are irrelevant (only str
99
# is supported as input, not bytes).
1010

11-
@unittest.skip("TODO: RUSTPYTHON")
1211
def test_encoding3(self):
1312
u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
1413
j = self.dumps(u)
1514
self.assertEqual(j, '"\\u03b1\\u03a9"')
1615

17-
@unittest.skip("TODO: RUSTPYTHON")
1816
def test_encoding4(self):
1917
u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
2018
j = self.dumps([u])
2119
self.assertEqual(j, '["\\u03b1\\u03a9"]')
2220

23-
@unittest.skip("TODO: RUSTPYTHON")
2421
def test_encoding5(self):
2522
u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
2623
j = self.dumps(u, ensure_ascii=False)
2724
self.assertEqual(j, '"{0}"'.format(u))
2825

29-
@unittest.skip("TODO: RUSTPYTHON")
3026
def test_encoding6(self):
3127
u = '\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
3228
j = self.dumps([u], ensure_ascii=False)

parser/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,4 +17,5 @@ log="0.4.1"
1717
num-bigint = "0.2"
1818
num-traits = "0.2"
1919
unic-emoji-char = "0.9"
20-
unic-ucd-ident = "0.9"
20+
unic-ucd-ident = "0.9"
21+
unicode_names2 = "0.4"

parser/src/lexer.rs

Lines changed: 53 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -475,6 +475,37 @@ where
475475
u8::from_str_radix(&octet_content, 8).unwrap() as char
476476
}
477477

478+
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
479+
let start_pos = self.get_pos();
480+
match self.next_char() {
481+
Some('{') => {}
482+
_ => {
483+
return Err(LexicalError {
484+
error: LexicalErrorType::StringError,
485+
location: start_pos,
486+
})
487+
}
488+
}
489+
let start_pos = self.get_pos();
490+
let mut name = String::new();
491+
loop {
492+
match self.next_char() {
493+
Some('}') => break,
494+
Some(c) => name.push(c),
495+
None => {
496+
return Err(LexicalError {
497+
error: LexicalErrorType::StringError,
498+
location: self.get_pos(),
499+
})
500+
}
501+
}
502+
}
503+
unicode_names2::character(&name).ok_or(LexicalError {
504+
error: LexicalErrorType::UnicodeError,
505+
location: start_pos,
506+
})
507+
}
508+
478509
fn lex_string(
479510
&mut self,
480511
is_bytes: bool,
@@ -532,11 +563,14 @@ where
532563
Some('t') => {
533564
string_content.push('\t');
534565
}
535-
Some('u') => string_content.push(self.unicode_literal(4)?),
536-
Some('U') => string_content.push(self.unicode_literal(8)?),
537-
Some('x') => string_content.push(self.unicode_literal(2)?),
538566
Some('v') => string_content.push('\x0b'),
539567
Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)),
568+
Some('x') => string_content.push(self.unicode_literal(2)?),
569+
Some('u') if !is_bytes => string_content.push(self.unicode_literal(4)?),
570+
Some('U') if !is_bytes => string_content.push(self.unicode_literal(8)?),
571+
Some('N') if !is_bytes => {
572+
string_content.push(self.parse_unicode_name()?)
573+
}
540574
Some(c) => {
541575
string_content.push('\\');
542576
string_content.push(c);
@@ -1687,4 +1721,20 @@ mod tests {
16871721
]
16881722
)
16891723
}
1724+
1725+
#[test]
fn test_escape_unicode_name() {
    // A `\N{EN SPACE}` escape must lex to the single character U+2002.
    let tokens = lex_source(r#""\N{EN SPACE}""#);
    let expected = vec![
        Tok::String {
            value: String::from("\u{2002}"),
            is_fstring: false,
        },
        Tok::Newline,
    ];
    assert_eq!(tokens, expected);
}
16901740
}

0 commit comments

Comments
 (0)