Skip to content

Commit 1f8c80b

Browse files
authored
Merge pull request RustPython#1824 from palaviv/fix-test-bytes
Escape octet in string
2 parents 44bd11f + 0444c1a commit 1f8c80b

File tree

2 files changed

+23
-104
lines changed

2 files changed

+23
-104
lines changed

Lib/test/test_bytes.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -834,8 +834,6 @@ def test_xjust_int_error(self):
834834
self.assertRaises(TypeError, self.type2test(b'abc').ljust, 7, 32)
835835
self.assertRaises(TypeError, self.type2test(b'abc').rjust, 7, 32)
836836

837-
# TODO: RUSTPYTHON
838-
@unittest.expectedFailure
839837
def test_ord(self):
840838
b = self.type2test(b'\0A\x7f\x80\xff')
841839
self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],

parser/src/lexer.rs

Lines changed: 23 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,19 @@ where
462462
}
463463
}
464464

465+
fn parse_octet(&mut self, first: char) -> char {
466+
let mut octet_content = String::new();
467+
octet_content.push(first);
468+
while octet_content.len() < 3 {
469+
if let Some('0'..='7') = self.chr0 {
470+
octet_content.push(self.next_char().unwrap())
471+
} else {
472+
break;
473+
}
474+
}
475+
u8::from_str_radix(&octet_content, 8).unwrap() as char
476+
}
477+
465478
fn lex_string(
466479
&mut self,
467480
is_bytes: bool,
@@ -521,8 +534,9 @@ where
521534
}
522535
Some('u') => string_content.push(self.unicode_literal(4)?),
523536
Some('U') => string_content.push(self.unicode_literal(8)?),
524-
Some('x') if !is_bytes => string_content.push(self.unicode_literal(2)?),
537+
Some('x') => string_content.push(self.unicode_literal(2)?),
525538
Some('v') => string_content.push('\x0b'),
539+
Some(o @ '0'..='7') => string_content.push(self.parse_octet(o)),
526540
Some(c) => {
527541
string_content.push('\\');
528542
string_content.push(c);
@@ -552,7 +566,7 @@ where
552566
break;
553567
}
554568
} else {
555-
if c == '\n' && !triple_quoted {
569+
if (c == '\n' && !triple_quoted) || (is_bytes && !c.is_ascii()) {
556570
return Err(LexicalError {
557571
error: LexicalErrorType::StringError,
558572
location: self.get_pos(),
@@ -572,21 +586,8 @@ where
572586
let end_pos = self.get_pos();
573587

574588
let tok = if is_bytes {
575-
if string_content.is_ascii() {
576-
let value = if is_raw {
577-
string_content.into_bytes()
578-
} else {
579-
lex_byte(string_content).map_err(|error| LexicalError {
580-
error,
581-
location: self.get_pos(),
582-
})?
583-
};
584-
Tok::Bytes { value }
585-
} else {
586-
return Err(LexicalError {
587-
error: LexicalErrorType::StringError,
588-
location: self.get_pos(),
589-
});
589+
Tok::Bytes {
590+
value: string_content.chars().map(|c| c as u8).collect(),
590591
}
591592
} else {
592593
Tok::String {
@@ -1231,90 +1232,6 @@ where
12311232
}
12321233
}
12331234

1234-
#[derive(Debug)]
1235-
enum EscapeMode {
1236-
NORMAL,
1237-
HEX,
1238-
OCTET,
1239-
}
1240-
1241-
fn lex_byte(s: String) -> Result<Vec<u8>, LexicalErrorType> {
1242-
let mut res = vec![];
1243-
let mut escape: Option<EscapeMode> = None;
1244-
let mut escape_buffer = String::new();
1245-
1246-
let mut chars_iter = s.chars();
1247-
let mut next_char = chars_iter.next();
1248-
1249-
while let Some(c) = next_char {
1250-
match escape {
1251-
Some(EscapeMode::OCTET) => {
1252-
if let '0'..='7' = c {
1253-
escape_buffer.push(c);
1254-
next_char = chars_iter.next();
1255-
if escape_buffer.len() < 3 {
1256-
continue;
1257-
}
1258-
}
1259-
res.push(u8::from_str_radix(&escape_buffer, 8).unwrap());
1260-
escape = None;
1261-
escape_buffer.clear();
1262-
}
1263-
Some(EscapeMode::HEX) => {
1264-
if c.is_ascii_hexdigit() {
1265-
if escape_buffer.is_empty() {
1266-
escape_buffer.push(c);
1267-
} else {
1268-
escape_buffer.push(c);
1269-
res.push(u8::from_str_radix(&escape_buffer, 16).unwrap());
1270-
escape = None;
1271-
escape_buffer.clear();
1272-
}
1273-
next_char = chars_iter.next();
1274-
} else {
1275-
return Err(LexicalErrorType::StringError);
1276-
}
1277-
}
1278-
Some(EscapeMode::NORMAL) => {
1279-
match c {
1280-
'\\' => res.push(b'\\'),
1281-
'x' => {
1282-
escape = Some(EscapeMode::HEX);
1283-
next_char = chars_iter.next();
1284-
continue;
1285-
}
1286-
't' => res.push(b'\t'),
1287-
'n' => res.push(b'\n'),
1288-
'r' => res.push(b'\r'),
1289-
'0'..='7' => {
1290-
escape = Some(EscapeMode::OCTET);
1291-
continue;
1292-
}
1293-
x => {
1294-
res.push(b'\\');
1295-
res.push(x as u8);
1296-
}
1297-
}
1298-
escape = None;
1299-
next_char = chars_iter.next();
1300-
}
1301-
None => {
1302-
match c {
1303-
'\\' => escape = Some(EscapeMode::NORMAL),
1304-
x => res.push(x as u8),
1305-
}
1306-
next_char = chars_iter.next();
1307-
}
1308-
}
1309-
}
1310-
match escape {
1311-
Some(EscapeMode::OCTET) => res.push(u8::from_str_radix(&escape_buffer, 8).unwrap()),
1312-
Some(EscapeMode::HEX) => return Err(LexicalErrorType::StringError),
1313-
_ => (),
1314-
}
1315-
Ok(res)
1316-
}
1317-
13181235
#[cfg(test)]
13191236
mod tests {
13201237
use super::{make_tokenizer, NewlineHandler, Tok};
@@ -1642,7 +1559,7 @@ mod tests {
16421559

16431560
#[test]
16441561
fn test_string() {
1645-
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\''"#;
1562+
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\200\0a'"#;
16461563
let tokens = lex_source(source);
16471564
assert_eq!(
16481565
tokens,
@@ -1675,6 +1592,10 @@ mod tests {
16751592
value: String::from("raw\'"),
16761593
is_fstring: false,
16771594
},
1595+
Tok::String {
1596+
value: String::from("\u{80}\u{0}a"),
1597+
is_fstring: false,
1598+
},
16781599
Tok::Newline,
16791600
]
16801601
);

0 commit comments

Comments
 (0)