From 8c069633d4fc403475b6168a09e3f4e04a9c1bc6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 13:50:59 +0300 Subject: [PATCH 1/3] bpo-40593: Improve syntax errors for invalid characters in source code. --- Include/cpython/unicodeobject.h | 2 + Include/errcode.h | 1 - Lib/test/test_fstring.py | 2 +- Lib/test/test_source_encoding.py | 3 ++ Lib/test/test_unicode_identifiers.py | 8 ++-- .../2020-05-11-13-50-52.bpo-40593.yuOXj3.rst | 1 + Objects/unicodeobject.c | 37 +++++++++++++++++ Parser/pegen/pegen.c | 3 -- Parser/tokenizer.c | 41 +++++++++++++++---- Python/pythonrun.c | 3 -- 10 files changed, 81 insertions(+), 20 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 81a35cdc801d09..94326876292b63 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); and where the hash values are equal (i.e. a very probable match) */ PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); +PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *); + #ifdef __cplusplus } #endif diff --git a/Include/errcode.h b/Include/errcode.h index b37cd261d5ec4d..790518b8b7730e 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -29,7 +29,6 @@ extern "C" { #define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_IDENTIFIER 26 /* Invalid characters in identifier */ #define E_BADSINGLE 27 /* Ill-formed single statement input */ #ifdef __cplusplus diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index ac5aa9a76efe7c..393be624b6ae25 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, 'invalid character in identifier', + self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0bd741c36ac29..5ca43461d9940d 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -57,6 +57,9 @@ def test_issue7820(self): # one byte in common with the UTF-16-LE BOM self.assertRaises(SyntaxError, eval, b'\xff\x20') + # one byte in common with the UTF-8 BOM + self.assertRaises(SyntaxError, eval, b'\xef\x20') + # two bytes in common with the UTF-8 BOM self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py index 07332c4631903e..5b9ced5d1cb837 100644 --- a/Lib/test/test_unicode_identifiers.py +++ b/Lib/test/test_unicode_identifiers.py @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self): def test_invalid(self): try: from test import badsyntax_3131 - except SyntaxError as s: - self.assertEqual(str(s), - "invalid character in identifier (badsyntax_3131.py, line 2)") + except SyntaxError as err: + self.assertEqual(str(err), + "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)") + self.assertEqual(err.lineno, 2) + self.assertEqual(err.offset, 1) else: self.fail("expected exception didn't occur") diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst new file mode 100644 index 00000000000000..5587d4f49ccf97 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst @@ -0,0 +1 @@ +Improved syntax errors for invalid characters in source code. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b9458721de18..4c79bd18af2827 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12309,6 +12309,43 @@ unicode_isnumeric_impl(PyObject *self) Py_RETURN_TRUE; } +Py_ssize_t +_PyUnicode_ScanIdentifier(PyObject *self) +{ + Py_ssize_t i; + if (PyUnicode_READY(self) == -1) + return -1; + + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; + } + + int kind = PyUnicode_KIND(self); + const void *data = PyUnicode_DATA(self); + Py_UCS4 ch = PyUnicode_READ(kind, data, 0); + /* PEP 3131 says that the first character must be in + XID_Start and subsequent characters in XID_Continue, + and for the ASCII range, the 2.x rules apply (i.e + start with letters and underscore, continue with + letters, digits, underscore). However, given the current + definition of XID_Start and XID_Continue, it is sufficient + to check just for these, except that _ must be allowed + as starting an identifier. */ + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { + return 0; + } + + for (i = 1; i < len; i++) { + ch = PyUnicode_READ(kind, data, i); + if (!_PyUnicode_IsXidContinue(ch)) { + return i; + } + } + return i; +} + int PyUnicode_IsIdentifier(PyObject *self) { diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index c80f08668b07d6..5f8c862c1f88be 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -337,9 +337,6 @@ tokenizer_error(Parser *p) case E_TOKEN: msg = "invalid token"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_EOFS: RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); return -1; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2b6af5e50adf..6d6f930a0df633 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1101,25 +1101,48 @@ static int verify_identifier(struct tok_state *tok) { PyObject *s; - int result; if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); if (s == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { + tok->done = E_DECODE; + } + else { tok->done = E_ERROR; } return 0; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) { - tok->done = E_IDENTIFIER; + Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); + if (invalid < 0) { + Py_DECREF(s); + tok->done = E_ERROR; + return 0; } - return result; + assert(PyUnicode_GET_LENGTH(s) > 0); + if (invalid < PyUnicode_GET_LENGTH(s)) { + Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + /* Determine the offset in UTF-8 encoded input */ + Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); + if (s != NULL) { + Py_SETREF(s, PyUnicode_AsUTF8String(s)); + } + if (s == NULL) { + tok->done = E_ERROR; + return 0; + } + tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); + } + Py_DECREF(s); + // PyUnicode_FromFormatV() does not support %X + char hex[9]; + snprintf(hex, sizeof(hex), "%04X", ch); + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + return 0; + } + Py_DECREF(s); + return 1; } static int diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 1b79a33c814da1..45f08b707eb999 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1603,9 +1603,6 @@ err_input(perrdetail *err) msg = "unexpected character after line continuation character"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_BADSINGLE: msg = "multiple statements found while compiling a single statement"; break; From e543b404cac4ec0b5dac18da0d89a46d018c7924 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 15:03:16 +0300 Subject: [PATCH 2/3] Omit non-printable characters. --- Lib/test/test_fstring.py | 2 +- Parser/tokenizer.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 393be624b6ae25..e0bb5b56b2614f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)", + self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6d6f930a0df633..b81fa118f216eb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1138,7 +1138,12 @@ verify_identifier(struct tok_state *tok) // PyUnicode_FromFormatV() does not support %X char hex[9]; snprintf(hex, sizeof(hex), "%04X", ch); - syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + if (Py_UNICODE_ISPRINTABLE(ch)) { + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + } + else { + syntaxerror(tok, "invalid non-printable character U+%s", hex); + } return 0; } Py_DECREF(s); From 5f3b60ea9d70cc1c547f75593e1eb4d1b51da8a6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 19:04:22 +0300 Subject: [PATCH 3/3] Refactor PyUnicode_IsIdentifier(). --- Objects/unicodeobject.c | 57 ++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4c79bd18af2827..276547ca48a5b2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12349,52 +12349,33 @@ _PyUnicode_ScanIdentifier(PyObject *self) int PyUnicode_IsIdentifier(PyObject *self) { - Py_ssize_t i; - int ready = PyUnicode_IS_READY(self); - - Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); - if (len == 0) { + if (PyUnicode_IS_READY(self)) { + Py_ssize_t i = _PyUnicode_ScanIdentifier(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); /* an empty string is not a valid identifier */ - return 0; - } - - int kind = 0; - const void *data = NULL; - const wchar_t *wstr = NULL; - Py_UCS4 ch; - if (ready) { - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - ch = PyUnicode_READ(kind, data, 0); + return len && i == len; } else { - wstr = _PyUnicode_WSTR(self); - ch = wstr[0]; - } - /* PEP 3131 says that the first character must be in - XID_Start and subsequent characters in XID_Continue, - and for the ASCII range, the 2.x rules apply (i.e - start with letters and underscore, continue with - letters, digits, underscore). However, given the current - definition of XID_Start and XID_Continue, it is sufficient - to check just for these, except that _ must be allowed - as starting an identifier. */ - if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { - return 0; - } + Py_ssize_t i, len = PyUnicode_GET_SIZE(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; + } - for (i = 1; i < len; i++) { - if (ready) { - ch = PyUnicode_READ(kind, data, i); + const wchar_t *wstr = _PyUnicode_WSTR(self); + Py_UCS4 ch = wstr[0]; + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { + return 0; } - else { + + for (i = 1; i < len; i++) { ch = wstr[i]; + if (!_PyUnicode_IsXidContinue(ch)) { + return 0; + } } - if (!_PyUnicode_IsXidContinue(ch)) { - return 0; - } + return 1; } - return 1; } /*[clinic input]