diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 1783da8c10..0bdd1a37e8 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -834,8 +834,6 @@ def test_xjust_int_error(self): self.assertRaises(TypeError, self.type2test(b'abc').ljust, 7, 32) self.assertRaises(TypeError, self.type2test(b'abc').rjust, 7, 32) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_ord(self): b = self.type2test(b'\0A\x7f\x80\xff') self.assertEqual([ord(b[i:i+1]) for i in range(len(b))], diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index d43dc37bb6..9da8825ca6 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -462,6 +462,19 @@ where } } + fn parse_octet(&mut self, first: char) -> char { + let mut octet_content = String::new(); + octet_content.push(first); + while octet_content.len() < 3 { + if let Some('0'..='7') = self.chr0 { + octet_content.push(self.next_char().unwrap()) + } else { + break; + } + } + u8::from_str_radix(&octet_content, 8).unwrap() as char + } + fn lex_string( &mut self, is_bytes: bool, @@ -521,8 +534,9 @@ where } Some('u') => string_content.push(self.unicode_literal(4)?), Some('U') => string_content.push(self.unicode_literal(8)?), - Some('x') if !is_bytes => string_content.push(self.unicode_literal(2)?), + Some('x') => string_content.push(self.unicode_literal(2)?), Some('v') => string_content.push('\x0b'), + Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)), Some(c) => { string_content.push('\\'); string_content.push(c); @@ -552,7 +566,7 @@ where break; } } else { - if c == '\n' && !triple_quoted { + if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), @@ -572,21 +586,8 @@ where let end_pos = self.get_pos(); let tok = if is_bytes { - if string_content.is_ascii() { - let value = if is_raw { - string_content.into_bytes() - } else { - lex_byte(string_content).map_err(|error| LexicalError { - error, - 
location: self.get_pos(), - })? - }; - Tok::Bytes { value } - } else { - return Err(LexicalError { - error: LexicalErrorType::StringError, - location: self.get_pos(), - }); + Tok::Bytes { + value: string_content.chars().map(|c| c as u8).collect(), } } else { Tok::String { @@ -1231,90 +1232,6 @@ where } } -#[derive(Debug)] -enum EscapeMode { - NORMAL, - HEX, - OCTET, -} - -fn lex_byte(s: String) -> Result<Vec<u8>, LexicalErrorType> { - let mut res = vec![]; - let mut escape: Option<EscapeMode> = None; - let mut escape_buffer = String::new(); - - let mut chars_iter = s.chars(); - let mut next_char = chars_iter.next(); - - while let Some(c) = next_char { - match escape { - Some(EscapeMode::OCTET) => { - if let '0'..='7' = c { - escape_buffer.push(c); - next_char = chars_iter.next(); - if escape_buffer.len() < 3 { - continue; - } - } - res.push(u8::from_str_radix(&escape_buffer, 8).unwrap()); - escape = None; - escape_buffer.clear(); - } - Some(EscapeMode::HEX) => { - if c.is_ascii_hexdigit() { - if escape_buffer.is_empty() { - escape_buffer.push(c); - } else { - escape_buffer.push(c); - res.push(u8::from_str_radix(&escape_buffer, 16).unwrap()); - escape = None; - escape_buffer.clear(); - } - next_char = chars_iter.next(); - } else { - return Err(LexicalErrorType::StringError); - } - } - Some(EscapeMode::NORMAL) => { - match c { - '\\' => res.push(b'\\'), - 'x' => { - escape = Some(EscapeMode::HEX); - next_char = chars_iter.next(); - continue; - } - 't' => res.push(b'\t'), - 'n' => res.push(b'\n'), - 'r' => res.push(b'\r'), - '0'..='7' => { - escape = Some(EscapeMode::OCTET); - continue; - } - x => { - res.push(b'\\'); - res.push(x as u8); - } - } - escape = None; - next_char = chars_iter.next(); - } - None => { - match c { - '\\' => escape = Some(EscapeMode::NORMAL), - x => res.push(x as u8), - } - next_char = chars_iter.next(); - } - } - } - match escape { - Some(EscapeMode::OCTET) => res.push(u8::from_str_radix(&escape_buffer, 8).unwrap()), - Some(EscapeMode::HEX) => return
Err(LexicalErrorType::StringError), - _ => (), - } - Ok(res) -} - #[cfg(test)] mod tests { use super::{make_tokenizer, NewlineHandler, Tok}; @@ -1642,7 +1559,7 @@ mod tests { #[test] fn test_string() { - let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\''"#; + let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\200\0a'"#; let tokens = lex_source(source); assert_eq!( tokens, @@ -1675,6 +1592,10 @@ mod tests { value: String::from("raw\'"), is_fstring: false, }, + Tok::String { + value: String::from("\u{80}\u{0}a"), + is_fstring: false, + }, Tok::Newline, ] );