diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index 1b588826010717..7109e3d164e485 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -652,7 +652,7 @@ def test_syntaxerror_invalid_escape_sequence_multi_line(self): self.assertEqual( stderr.splitlines()[-3:], [ b' foo = """\\q"""', - b' ^^^^^^^^', + b' ^^', b'SyntaxError: invalid escape sequence \'\\q\'' ], ) diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py index c7c6f684cd33f0..3d793427c9ab5d 100644 --- a/Lib/test/test_string_literals.py +++ b/Lib/test/test_string_literals.py @@ -118,7 +118,7 @@ def test_eval_str_invalid_escape(self): self.assertEqual(len(w), 1) self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -128,7 +128,7 @@ def test_eval_str_invalid_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid escape sequence '\z'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) self.assertEqual(exc.offset, 1) # Check that the warning is raised only once if there are syntax errors @@ -155,7 +155,7 @@ def test_eval_str_invalid_octal_escape(self): self.assertEqual(str(w[0].message), r"invalid octal escape sequence '\407'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -165,9 +165,32 @@ def test_eval_str_invalid_octal_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) self.assertEqual(exc.offset, 1) + 
def test_invalid_escape_locations_with_offset(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('error', category=SyntaxWarning) + with self.assertRaises(SyntaxError) as cm: + eval("\"'''''''''''''''''''''invalid\\ Escape\"") + exc = cm.exception + self.assertEqual(w, []) + self.assertEqual(exc.msg, r"invalid escape sequence '\ '") + self.assertEqual(exc.filename, '') + self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.offset, 30) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('error', category=SyntaxWarning) + with self.assertRaises(SyntaxError) as cm: + eval("\"''Incorrect \\ logic?\"") + exc = cm.exception + self.assertEqual(w, []) + self.assertEqual(exc.msg, r"invalid escape sequence '\ '") + self.assertEqual(exc.filename, '') + self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.offset, 14) + def test_eval_str_raw(self): self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') @@ -207,7 +230,7 @@ def test_eval_bytes_invalid_escape(self): self.assertEqual(len(w), 1) self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('error', category=SyntaxWarning) @@ -217,7 +240,7 @@ def test_eval_bytes_invalid_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid escape sequence '\z'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) def test_eval_bytes_invalid_octal_escape(self): for i in range(0o400, 0o1000): @@ -231,7 +254,7 @@ def test_eval_bytes_invalid_octal_escape(self): self.assertEqual(str(w[0].message), r"invalid octal escape sequence '\407'") self.assertEqual(w[0].filename, '') - self.assertEqual(w[0].lineno, 1) + self.assertEqual(w[0].lineno, 2) with warnings.catch_warnings(record=True) as w: 
warnings.simplefilter('error', category=SyntaxWarning) @@ -241,7 +264,7 @@ def test_eval_bytes_invalid_octal_escape(self): self.assertEqual(w, []) self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'") self.assertEqual(exc.filename, '') - self.assertEqual(exc.lineno, 1) + self.assertEqual(exc.lineno, 2) def test_eval_bytes_raw(self): self.assertEqual(eval(""" br'x' """), b'x') diff --git a/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst new file mode 100644 index 00000000000000..098804fa92e804 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst @@ -0,0 +1,2 @@ +Fix location for SyntaxErrors of invalid escapes in the tokenizer. Patch by +Pablo Galindo. diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 72f1349897683d..d1cb91d299880e 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -350,8 +350,8 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) { - Py_ssize_t size = p->tok->inp - p->tok->buf; - error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); + Py_ssize_t size = p->tok->inp - p->tok->line_start; + error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace"); } else if (p->tok->fp == NULL || p->tok->fp == stdin) { error_line = get_error_line_from_tokenizer_buffers(p, lineno); diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 164f715e153eca..751b56d0ee0e2c 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -9,7 +9,7 @@ //// STRING HANDLING FUNCTIONS //// static int -warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) +warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t) { 
if (p->call_invalid_rules) { // Do not report warnings if we are in the second pass of the parser @@ -38,8 +38,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token else { category = PyExc_DeprecationWarning; } + + // Calculate the lineno and the col_offset of the invalid escape sequence + const char *start = buffer; + const char *end = first_invalid_escape; + int lineno = t->lineno; + int col_offset = t->col_offset; + while (start < end) { + if (*start == '\n') { + lineno++; + col_offset = 0; + } + else { + col_offset++; + } + start++; + } + + // Count the number of quotes in the token + char first_quote = 0; + if (lineno == t->lineno) { + int quote_count = 0; + char* tok = PyBytes_AsString(t->bytes); + for (int i = 0; i < PyBytes_Size(t->bytes); i++) { + if (tok[i] == '\'' || tok[i] == '\"') { + if (quote_count == 0) { + first_quote = tok[i]; + } + if (tok[i] == first_quote) { + quote_count++; + } + } else { + break; + } + } + + col_offset += quote_count; + } + if (PyErr_WarnExplicitObject(category, msg, p->tok->filename, - t->lineno, NULL, NULL) < 0) { + lineno, NULL, NULL) < 0) { if (PyErr_ExceptionMatches(category)) { /* Replace the Syntax/DeprecationWarning exception with a SyntaxError to get a more accurate error report */ @@ -50,11 +88,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token error location, if p->known_err_token is not set. 
*/ p->known_err_token = t; if (octal) { - RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'", - first_invalid_escape); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, + "invalid octal escape sequence '\\%.3s'", first_invalid_escape); } else { - RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c); + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1, + "invalid escape sequence '\\%c'", c); } } Py_DECREF(msg); @@ -148,7 +187,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) // HACK: later we can simply pass the line no, since we don't preserve the tokens // when we are decoding the string but we preserve the line numbers. if (v != NULL && first_invalid_escape != NULL && t != NULL) { - if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { + if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) { /* We have not decref u before because first_invalid_escape points inside u. */ Py_XDECREF(u); @@ -170,7 +209,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) } if (first_invalid_escape != NULL) { - if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { + if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) { Py_DECREF(result); return NULL; }