From 90012e7d6afb4148df33e4d95d2a6b57818bf766 Mon Sep 17 00:00:00 2001
From: Jeff Epler <jepler@gmail.com>
Date: Thu, 7 Aug 2025 09:10:57 -0500
Subject: [PATCH 1/3] tests: Add test of invalid unicode strings.

.. from non UTF-8 inputs. The test catches SyntaxError, which is what
CPython raises for these inputs and what the parser check added later
in this series raises in MicroPython, so the test does not require an
.exp file.

Signed-off-by: Jeff Epler <jepler@gmail.com>
---
 tests/unicode/unicode_parser.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 tests/unicode/unicode_parser.py

diff --git a/tests/unicode/unicode_parser.py b/tests/unicode/unicode_parser.py
new file mode 100644
index 0000000000000..b7859dcbcbfaa
--- /dev/null
+++ b/tests/unicode/unicode_parser.py
@@ -0,0 +1,23 @@
+# Test that eval/exec reject source containing invalid UTF-8.
+# The source is passed as bytes/bytearray so that it can hold invalid UTF-8.
+try:
+    eval(b"'ab\xa1'")
+except SyntaxError:
+    print("Exception")
+try:
+    eval(b"'ab\xf8'")
+except SyntaxError:
+    print("Exception")
+try:
+    eval(bytearray(b"'ab\xc0a'"))
+except SyntaxError:
+    print("Exception")
+try:
+    eval(b"'\xf0\xe0\xed\xe8'")
+except SyntaxError:
+    print("Exception")
+
+try:
+    exec(b"b\xff = 1")
+except SyntaxError:
+    print("Exception")

From a614243deb04ec3559f967b89dbdbf6d4becdef6 Mon Sep 17 00:00:00 2001
From: Jeff Epler <jepler@gmail.com>
Date: Thu, 7 Aug 2025 07:55:05 -0500
Subject: [PATCH 2/3] py: Reduce code size from utf8_check.

All sites immediately threw a UnicodeError, so roll that into
the new function mp_utf8_require.

unicode.c was designed not to require runtime.h, so move the
checking function into objstr.c.

Reduce the number of #if sites by making a do-nothing variant
that is used instead when !STR_UNICODE or !STR_UNICODE_CHECK.

Signed-off-by: Jeff Epler <jepler@gmail.com>
---
 py/objstr.c  | 56 +++++++++++++++++++++++++++++++++++++---------------
 py/objstr.h  |  9 +++++++++
 py/unicode.c | 32 ------------------------------
 py/unicode.h |  1 -
 4 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/py/objstr.c b/py/objstr.c
index c81fc682fd4e8..14932130058d3 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -208,11 +208,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
                 if (str_hash == 0) {
                     str_hash = qstr_compute_hash(str_data, str_len);
                 }
-                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
-                if (!utf8_check(str_data, str_len)) {
-                    mp_raise_msg(&mp_type_UnicodeError, NULL);
-                }
-                #endif
+                mp_utf8_require(str_data, str_len);
 
                 // Check if a qstr with this data already exists
                 qstr q = qstr_find_strn((const char *)str_data, str_len);
@@ -2285,17 +2281,13 @@ static mp_obj_t mp_obj_new_str_type_from_vstr(const mp_obj_type_t *type, vstr_t
 }
 
 mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr) {
-    #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
-    if (!utf8_check((byte *)vstr->buf, vstr->len)) {
-        mp_raise_msg(&mp_type_UnicodeError, NULL);
-    }
-    #endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+    mp_utf8_require((byte *)vstr->buf, vstr->len);
     return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
 }
 
 #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
 mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr) {
-    // bypasses utf8_check.
+    // bypasses mp_utf8_require (unlike mp_obj_new_str_from_vstr).
     return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
 }
 #endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
@@ -2305,11 +2297,7 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr) {
 }
 
 mp_obj_t mp_obj_new_str(const char *data, size_t len) {
-    #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
-    if (!utf8_check((byte *)data, len)) {
-        mp_raise_msg(&mp_type_UnicodeError, NULL);
-    }
-    #endif
+    mp_utf8_require((byte *)data, len);
     qstr q = qstr_find_strn(data, len);
     if (q != MP_QSTRnull) {
         // qstr with this data already exists
@@ -2471,3 +2459,39 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
     o->cur = 0;
     return MP_OBJ_FROM_PTR(o);
 }
+
+#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+static bool mp_utf8_check(const byte *p, size_t len) {
+    uint8_t need = 0;
+    const byte *end = p + len;
+    for (; p < end; p++) {
+        byte c = *p;
+        if (need) {
+            if (UTF8_IS_CONT(c)) {
+                need--;
+            } else {
+                // more bytes were expected but this one fails UTF8_IS_CONT
+                return 0;
+            }
+        } else {
+            if (c >= 0xc0) {
+                if (c >= 0xf8) {
+                    // bytes 0xf8..0xff are rejected as lead bytes
+                    return 0;
+                }
+                need = (0xe5 >> ((c >> 3) & 0x6)) & 3; // 0xc0-0xdf: 1, 0xe0-0xef: 2, 0xf0-0xf7: 3
+            } else if (c >= 0x80) {
+                // bytes 0x80..0xbf are only accepted while need is non-zero
+                return 0;
+            }
+        }
+    }
+    return need == 0; // input must not end while bytes are still expected
+}
+
+void mp_utf8_require(const byte *p, size_t len) {
+    if (!mp_utf8_check(p, len)) {
+        mp_raise_msg(&mp_type_UnicodeError, NULL);
+    }
+}
+#endif
diff --git a/py/objstr.h b/py/objstr.h
index 028fc9597ffc8..2b87f27c56a51 100644
--- a/py/objstr.h
+++ b/py/objstr.h
@@ -119,4 +119,13 @@ extern const mp_obj_dict_t mp_obj_bytearray_locals_dict;
 extern const mp_obj_dict_t mp_obj_array_locals_dict;
 #endif
 
+#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+// Throws an exception if string content is not UTF-8
+void mp_utf8_require(const byte *p, size_t len);
+#else
+// If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op
+static inline void mp_utf8_require(const byte *p, size_t len) {
+}
+#endif
+
 #endif // MICROPY_INCLUDED_PY_OBJSTR_H
diff --git a/py/unicode.c b/py/unicode.c
index 81a37880f3c72..076ab71f6c8cd 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -177,35 +177,3 @@ mp_uint_t unichar_xdigit_value(unichar c) {
     }
     return n;
 }
-
-#if MICROPY_PY_BUILTINS_STR_UNICODE
-
-bool utf8_check(const byte *p, size_t len) {
-    uint8_t need = 0;
-    const byte *end = p + len;
-    for (; p < end; p++) {
-        byte c = *p;
-        if (need) {
-            if (UTF8_IS_CONT(c)) {
-                need--;
-            } else {
-                // mismatch
-                return 0;
-            }
-        } else {
-            if (c >= 0xc0) {
-                if (c >= 0xf8) {
-                    // mismatch
-                    return 0;
-                }
-                need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
-            } else if (c >= 0x80) {
-                // mismatch
-                return 0;
-            }
-        }
-    }
-    return need == 0; // no pending fragments allowed
-}
-
-#endif
diff --git a/py/unicode.h b/py/unicode.h
index c1fb517894f64..19487a65ae8b9 100644
--- a/py/unicode.h
+++ b/py/unicode.h
@@ -30,6 +30,5 @@
 #include "py/misc.h"
 
 mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
-bool utf8_check(const byte *p, size_t len);
 
 #endif // MICROPY_INCLUDED_PY_UNICODE_H

From 90e366eefa80b27601bf30f3a1bdf7f562548859 Mon Sep 17 00:00:00 2001
From: Jeff Epler <jepler@gmail.com>
Date: Thu, 7 Aug 2025 09:02:57 -0500
Subject: [PATCH 3/3] parse: Don't allow creation of invalid UTF8 strings or
 identifiers.

.. even when compiling non UTF-8 files or byte strings.

Closes: #17855
Signed-off-by: Jeff Epler <jepler@gmail.com>
---
 py/objstr.c | 2 +-
 py/objstr.h | 4 ++++
 py/parse.c  | 5 +++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/py/objstr.c b/py/objstr.c
index 14932130058d3..087dc478b31b4 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -2461,7 +2461,7 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
 }
 
 #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
-static bool mp_utf8_check(const byte *p, size_t len) {
+bool mp_utf8_check(const byte *p, size_t len) {
     uint8_t need = 0;
     const byte *end = p + len;
     for (; p < end; p++) {
diff --git a/py/objstr.h b/py/objstr.h
index 2b87f27c56a51..d09284045aa09 100644
--- a/py/objstr.h
+++ b/py/objstr.h
@@ -122,10 +122,14 @@ extern const mp_obj_dict_t mp_obj_array_locals_dict;
 #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
 // Throws an exception if string content is not UTF-8
 void mp_utf8_require(const byte *p, size_t len);
+bool mp_utf8_check(const byte *p, size_t len);
 #else
 // If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op
 static inline void mp_utf8_require(const byte *p, size_t len) {
 }
+static inline bool mp_utf8_check(const byte *p, size_t len) {
+    return true;
+}
 #endif
 
 #endif // MICROPY_INCLUDED_PY_OBJSTR_H
diff --git a/py/parse.c b/py/parse.c
index 1a50b13b5c790..4e818edcda1e4 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -598,6 +598,11 @@ static mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t
 static void push_result_token(parser_t *parser, uint8_t rule_id) {
     mp_parse_node_t pn;
     mp_lexer_t *lex = parser->lexer;
+    if (lex->tok_kind == MP_TOKEN_NAME || lex->tok_kind == MP_TOKEN_STRING) {
+        if (!mp_utf8_check((byte *)lex->vstr.buf, lex->vstr.len)) {
+            mp_raise_msg(&mp_type_SyntaxError, NULL);
+        }
+    }
     if (lex->tok_kind == MP_TOKEN_NAME) {
         qstr id = qstr_from_strn(lex->vstr.buf, lex->vstr.len);
         #if MICROPY_COMP_CONST