From 8c069633d4fc403475b6168a09e3f4e04a9c1bc6 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 11 May 2020 13:50:59 +0300
Subject: [PATCH 1/3] bpo-40593: Improve syntax errors for invalid characters
 in source code.

---
 Include/cpython/unicodeobject.h               |  2 +
 Include/errcode.h                             |  1 -
 Lib/test/test_fstring.py                      |  2 +-
 Lib/test/test_source_encoding.py              |  3 ++
 Lib/test/test_unicode_identifiers.py          |  8 ++--
 .../2020-05-11-13-50-52.bpo-40593.yuOXj3.rst  |  1 +
 Objects/unicodeobject.c                       | 37 +++++++++++++++++
 Parser/pegen/pegen.c                          |  3 --
 Parser/tokenizer.c                            | 41 +++++++++++++++----
 Python/pythonrun.c                            |  3 --
 10 files changed, 81 insertions(+), 20 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst

diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 81a35cdc801d09..94326876292b63 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
    and where the hash values are equal (i.e. a very probable match) */
 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
 
+PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/errcode.h b/Include/errcode.h
index b37cd261d5ec4d..790518b8b7730e 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -29,7 +29,6 @@ extern "C" {
 #define E_EOFS          23      /* EOF in triple-quoted string */
 #define E_EOLS          24      /* EOL in single-quoted string */
 #define E_LINECONT      25      /* Unexpected characters after a line continuation */
-#define E_IDENTIFIER    26      /* Invalid characters in identifier */
 #define E_BADSINGLE     27      /* Ill-formed single statement input */
 
 #ifdef __cplusplus
diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py
index ac5aa9a76efe7c..393be624b6ae25 100644
--- a/Lib/test/test_fstring.py
+++ b/Lib/test/test_fstring.py
@@ -583,7 +583,7 @@ def test_missing_expression(self):
                              ])
 
         # Different error message is raised for other whitespace characters.
-        self.assertAllRaise(SyntaxError, 'invalid character in identifier',
+        self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)",
                             ["f'''{\xa0}'''",
                              "\xa0",
                              ])
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index a0bd741c36ac29..5ca43461d9940d 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -57,6 +57,9 @@ def test_issue7820(self):
         # one byte in common with the UTF-16-LE BOM
         self.assertRaises(SyntaxError, eval, b'\xff\x20')
 
+        # one byte in common with the UTF-8 BOM
+        self.assertRaises(SyntaxError, eval, b'\xef\x20')
+
         # two bytes in common with the UTF-8 BOM
         self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
 
diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py
index 07332c4631903e..5b9ced5d1cb837 100644
--- a/Lib/test/test_unicode_identifiers.py
+++ b/Lib/test/test_unicode_identifiers.py
@@ -20,9 +20,11 @@ def test_non_bmp_normalized(self):
     def test_invalid(self):
         try:
             from test import badsyntax_3131
-        except SyntaxError as s:
-            self.assertEqual(str(s),
-              "invalid character in identifier (badsyntax_3131.py, line 2)")
+        except SyntaxError as err:
+            self.assertEqual(str(err),
+              "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
+            self.assertEqual(err.lineno, 2)
+            self.assertEqual(err.offset, 1)
         else:
             self.fail("expected exception didn't occur")
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst
new file mode 100644
index 00000000000000..5587d4f49ccf97
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst	
@@ -0,0 +1 @@
+Improved syntax errors for invalid characters in source code.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 18b9458721de18..4c79bd18af2827 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12309,6 +12309,43 @@ unicode_isnumeric_impl(PyObject *self)
     Py_RETURN_TRUE;
 }
 
+Py_ssize_t
+_PyUnicode_ScanIdentifier(PyObject *self)
+{
+    Py_ssize_t i;
+    if (PyUnicode_READY(self) == -1)
+        return -1;
+
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    if (len == 0) {
+        /* an empty string is not a valid identifier */
+        return 0;
+    }
+
+    int kind = PyUnicode_KIND(self);
+    const void *data = PyUnicode_DATA(self);
+    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
+    /* PEP 3131 says that the first character must be in
+       XID_Start and subsequent characters in XID_Continue,
+       and for the ASCII range, the 2.x rules apply (i.e
+       start with letters and underscore, continue with
+       letters, digits, underscore). However, given the current
+       definition of XID_Start and XID_Continue, it is sufficient
+       to check just for these, except that _ must be allowed
+       as starting an identifier.  */
+    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
+        return 0;
+    }
+
+    for (i = 1; i < len; i++) {
+        ch = PyUnicode_READ(kind, data, i);
+        if (!_PyUnicode_IsXidContinue(ch)) {
+            return i;
+        }
+    }
+    return i;
+}
+
 int
 PyUnicode_IsIdentifier(PyObject *self)
 {
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c
index c80f08668b07d6..5f8c862c1f88be 100644
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
         case E_TOKEN:
             msg = "invalid token";
             break;
-        case E_IDENTIFIER:
-            msg = "invalid character in identifier";
-            break;
         case E_EOFS:
             RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
             return -1;
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0f2b6af5e50adf..6d6f930a0df633 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1101,25 +1101,48 @@ static int
 verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
-    int result;
     if (tok->decoding_erred)
         return 0;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
     if (s == NULL) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
-            PyErr_Clear();
-            tok->done = E_IDENTIFIER;
-        } else {
+            tok->done = E_DECODE;
+        }
+        else {
             tok->done = E_ERROR;
         }
         return 0;
     }
-    result = PyUnicode_IsIdentifier(s);
-    Py_DECREF(s);
-    if (result == 0) {
-        tok->done = E_IDENTIFIER;
+    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
+    if (invalid < 0) {
+        Py_DECREF(s);
+        tok->done = E_ERROR;
+        return 0;
     }
-    return result;
+    assert(PyUnicode_GET_LENGTH(s) > 0);
+    if (invalid < PyUnicode_GET_LENGTH(s)) {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
+        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
+            /* Determine the offset in UTF-8 encoded input */
+            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
+            if (s != NULL) {
+                Py_SETREF(s, PyUnicode_AsUTF8String(s));
+            }
+            if (s == NULL) {
+                tok->done = E_ERROR;
+                return 0;
+            }
+            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
+        }
+        Py_DECREF(s);
+        // PyUnicode_FromFormatV() does not support %X
+        char hex[9];
+        snprintf(hex, sizeof(hex), "%04X", ch);
+        syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+        return 0;
+    }
+    Py_DECREF(s);
+    return 1;
 }
 
 static int
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 1b79a33c814da1..45f08b707eb999 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
         msg = "unexpected character after line continuation character";
         break;
 
-    case E_IDENTIFIER:
-        msg = "invalid character in identifier";
-        break;
     case E_BADSINGLE:
         msg = "multiple statements found while compiling a single statement";
         break;

From e543b404cac4ec0b5dac18da0d89a46d018c7924 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 11 May 2020 15:03:16 +0300
Subject: [PATCH 2/3] Omit non-printable characters.

---
 Lib/test/test_fstring.py | 2 +-
 Parser/tokenizer.c       | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py
index 393be624b6ae25..e0bb5b56b2614f 100644
--- a/Lib/test/test_fstring.py
+++ b/Lib/test/test_fstring.py
@@ -583,7 +583,7 @@ def test_missing_expression(self):
                              ])
 
         # Different error message is raised for other whitespace characters.
-        self.assertAllRaise(SyntaxError, r"invalid character '\u00a0' \(U\+00A0\)",
+        self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
                             ["f'''{\xa0}'''",
                              "\xa0",
                              ])
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 6d6f930a0df633..b81fa118f216eb 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1138,7 +1138,12 @@ verify_identifier(struct tok_state *tok)
         // PyUnicode_FromFormatV() does not support %X
         char hex[9];
         snprintf(hex, sizeof(hex), "%04X", ch);
-        syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+        if (Py_UNICODE_ISPRINTABLE(ch)) {
+            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
+        }
+        else {
+            syntaxerror(tok, "invalid non-printable character U+%s", hex);
+        }
         return 0;
     }
     Py_DECREF(s);

From 5f3b60ea9d70cc1c547f75593e1eb4d1b51da8a6 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 11 May 2020 19:04:22 +0300
Subject: [PATCH 3/3] Refactor PyUnicode_IsIdentifier().

---
 Objects/unicodeobject.c | 57 ++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 38 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4c79bd18af2827..276547ca48a5b2 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12349,52 +12349,33 @@ _PyUnicode_ScanIdentifier(PyObject *self)
 int
 PyUnicode_IsIdentifier(PyObject *self)
 {
-    Py_ssize_t i;
-    int ready = PyUnicode_IS_READY(self);
-
-    Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
-    if (len == 0) {
+    if (PyUnicode_IS_READY(self)) {
+        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
+        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
         /* an empty string is not a valid identifier */
-        return 0;
-    }
-
-    int kind = 0;
-    const void *data = NULL;
-    const wchar_t *wstr = NULL;
-    Py_UCS4 ch;
-    if (ready) {
-        kind = PyUnicode_KIND(self);
-        data = PyUnicode_DATA(self);
-        ch = PyUnicode_READ(kind, data, 0);
+        return len && i == len;
     }
     else {
-        wstr = _PyUnicode_WSTR(self);
-        ch = wstr[0];
-    }
-    /* PEP 3131 says that the first character must be in
-       XID_Start and subsequent characters in XID_Continue,
-       and for the ASCII range, the 2.x rules apply (i.e
-       start with letters and underscore, continue with
-       letters, digits, underscore). However, given the current
-       definition of XID_Start and XID_Continue, it is sufficient
-       to check just for these, except that _ must be allowed
-       as starting an identifier.  */
-    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
-        return 0;
-    }
+        Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
+        if (len == 0) {
+            /* an empty string is not a valid identifier */
+            return 0;
+        }
 
-    for (i = 1; i < len; i++) {
-        if (ready) {
-            ch = PyUnicode_READ(kind, data, i);
+        const wchar_t *wstr = _PyUnicode_WSTR(self);
+        Py_UCS4 ch = wstr[0];
+        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
+            return 0;
         }
-        else {
+
+        for (i = 1; i < len; i++) {
             ch = wstr[i];
+            if (!_PyUnicode_IsXidContinue(ch)) {
+                return 0;
+            }
         }
-        if (!_PyUnicode_IsXidContinue(ch)) {
-            return 0;
-        }
+        return 1;
     }
-    return 1;
 }
 
 /*[clinic input]