Merge pull request RustPython#1824 from palaviv/fix-test-bytes

coolreader18 · web-flow · commit 1f8c80b9f774 · 2020-03-21T10:24:55.000-05:00
Escape octet in string
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
@@ -834,8 +834,6 @@ def test_xjust_int_error(self):
         self.assertRaises(TypeError, self.type2test(b'abc').ljust, 7, 32)
         self.assertRaises(TypeError, self.type2test(b'abc').rjust, 7, 32)
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_ord(self):
         b = self.type2test(b'\0A\x7f\x80\xff')
         self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
@@ -462,6 +462,19 @@ where
         }
     }
 
+    /// Parses an octal escape of one to three digits into a `char`.
+    ///
+    /// `first` is the digit already consumed; more digits are taken while `chr0` is octal.
+    fn parse_octet(&mut self, first: char) -> char {
+        let mut octet_content = first.to_string();
+        while octet_content.len() < 3 && self.chr0.map_or(false, |c| c.is_digit(8)) {
+            octet_content.push(self.next_char().unwrap());
+        }
+        // Parse as u32: `\400`..=`\777` overflow `u8`, and anything up to 0o777 is a valid char.
+        let value = u32::from_str_radix(&octet_content, 8).expect("only octal digits collected");
+        std::char::from_u32(value).expect("value is at most 0o777")
+    }
+
     fn lex_string(
         &mut self,
         is_bytes: bool,
@@ -521,8 +534,9 @@ where
                             }
                             Some('u') => string_content.push(self.unicode_literal(4)?),
                             Some('U') => string_content.push(self.unicode_literal(8)?),
-                            Some('x') if !is_bytes => string_content.push(self.unicode_literal(2)?),
+                            Some('x') => string_content.push(self.unicode_literal(2)?),
                             Some('v') => string_content.push('\x0b'),
+                            Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)),
                             Some(c) => {
                                 string_content.push('\\');
                                 string_content.push(c);
@@ -552,7 +566,7 @@ where
                             break;
                         }
                     } else {
-                        if c == '\n' && !triple_quoted {
+                        if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) {
                             return Err(LexicalError {
                                 error: LexicalErrorType::StringError,
                                 location: self.get_pos(),
@@ -572,21 +586,8 @@ where
         let end_pos = self.get_pos();
 
         let tok = if is_bytes {
-            if string_content.is_ascii() {
-                let value = if is_raw {
-                    string_content.into_bytes()
-                } else {
-                    lex_byte(string_content).map_err(|error| LexicalError {
-                        error,
-                        location: self.get_pos(),
-                    })?
-                };
-                Tok::Bytes { value }
-            } else {
-                return Err(LexicalError {
-                    error: LexicalErrorType::StringError,
-                    location: self.get_pos(),
-                });
+            Tok::Bytes {
+                value: string_content.chars().map(|c| c as u8).collect(),
             }
         } else {
             Tok::String {
@@ -1231,90 +1232,6 @@ where
     }
 }
 
-#[derive(Debug)]
-enum EscapeMode {
-    NORMAL,
-    HEX,
-    OCTET,
-}
-
-fn lex_byte(s: String) -> Result<Vec<u8>, LexicalErrorType> {
-    let mut res = vec![];
-    let mut escape: Option<EscapeMode> = None;
-    let mut escape_buffer = String::new();
-
-    let mut chars_iter = s.chars();
-    let mut next_char = chars_iter.next();
-
-    while let Some(c) = next_char {
-        match escape {
-            Some(EscapeMode::OCTET) => {
-                if let '0'..='7' = c {
-                    escape_buffer.push(c);
-                    next_char = chars_iter.next();
-                    if escape_buffer.len() < 3 {
-                        continue;
-                    }
-                }
-                res.push(u8::from_str_radix(&escape_buffer, 8).unwrap());
-                escape = None;
-                escape_buffer.clear();
-            }
-            Some(EscapeMode::HEX) => {
-                if c.is_ascii_hexdigit() {
-                    if escape_buffer.is_empty() {
-                        escape_buffer.push(c);
-                    } else {
-                        escape_buffer.push(c);
-                        res.push(u8::from_str_radix(&escape_buffer, 16).unwrap());
-                        escape = None;
-                        escape_buffer.clear();
-                    }
-                    next_char = chars_iter.next();
-                } else {
-                    return Err(LexicalErrorType::StringError);
-                }
-            }
-            Some(EscapeMode::NORMAL) => {
-                match c {
-                    '\\' => res.push(b'\\'),
-                    'x' => {
-                        escape = Some(EscapeMode::HEX);
-                        next_char = chars_iter.next();
-                        continue;
-                    }
-                    't' => res.push(b'\t'),
-                    'n' => res.push(b'\n'),
-                    'r' => res.push(b'\r'),
-                    '0'..='7' => {
-                        escape = Some(EscapeMode::OCTET);
-                        continue;
-                    }
-                    x => {
-                        res.push(b'\\');
-                        res.push(x as u8);
-                    }
-                }
-                escape = None;
-                next_char = chars_iter.next();
-            }
-            None => {
-                match c {
-                    '\\' => escape = Some(EscapeMode::NORMAL),
-                    x => res.push(x as u8),
-                }
-                next_char = chars_iter.next();
-            }
-        }
-    }
-    match escape {
-        Some(EscapeMode::OCTET) => res.push(u8::from_str_radix(&escape_buffer, 8).unwrap()),
-        Some(EscapeMode::HEX) => return Err(LexicalErrorType::StringError),
-        _ => (),
-    }
-    Ok(res)
-}
-
 #[cfg(test)]
 mod tests {
     use super::{make_tokenizer, NewlineHandler, Tok};
@@ -1642,7 +1559,7 @@ mod tests {
 
     #[test]
     fn test_string() {
-        let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\''"#;
+        let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\200\0a'"#;
         let tokens = lex_source(source);
         assert_eq!(
             tokens,
@@ -1675,6 +1592,10 @@ mod tests {
                     value: String::from("raw\'"),
                     is_fstring: false,
                 },
+                Tok::String {
+                    value: String::from("\u{80}\u{0}a"),
+                    is_fstring: false,
+                },
                 Tok::Newline,
             ]
         );