From 90012e7d6afb4148df33e4d95d2a6b57818bf766 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Thu, 7 Aug 2025 09:10:57 -0500 Subject: [PATCH 1/3] tests: Add test of invalid unicode strings. .. from non UTF-8 inputs. In this case, both CPython and MicroPython (once the parser check later in this series is applied) raise SyntaxError. By catching SyntaxError, the test does not require an .exp file. Signed-off-by: Jeff Epler --- tests/unicode/unicode_parser.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/unicode/unicode_parser.py diff --git a/tests/unicode/unicode_parser.py b/tests/unicode/unicode_parser.py new file mode 100644 index 0000000000000..b7859dcbcbfaa --- /dev/null +++ b/tests/unicode/unicode_parser.py @@ -0,0 +1,23 @@ +# test invalid UTF-8 string via eval +# Passing byte strings to exec/eval is a micropython extension +try: + eval(b"'ab\xa1'") +except SyntaxError: + print("Exception") +try: + eval(b"'ab\xf8'") +except SyntaxError: + print("Exception") +try: + eval(bytearray(b"'ab\xc0a'")) +except SyntaxError: + print("Exception") +try: + eval(b"'\xf0\xe0\xed\xe8'") +except SyntaxError: + print("Exception") + +try: + exec(b"b\xff = 1") +except SyntaxError: + print("Exception") From a614243deb04ec3559f967b89dbdbf6d4becdef6 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Thu, 7 Aug 2025 07:55:05 -0500 Subject: [PATCH 2/3] py: Reduce code size from utf8_check. All sites immediately threw a UnicodeError, so roll that into the new function mp_utf8_require. unicode.c was designed not to require runtime.h, so move the checking function into objstr.c. Reduce the number of #if sites by making a do-nothing variant that is used instead when !STR_UNICODE or !STR_UNICODE_CHECK. 
Signed-off-by: Jeff Epler --- py/objstr.c | 56 +++++++++++++++++++++++++++++++++++++--------------- py/objstr.h | 9 +++++++++ py/unicode.c | 32 ------------------------------ py/unicode.h | 1 - 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index c81fc682fd4e8..14932130058d3 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -208,11 +208,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ if (str_hash == 0) { str_hash = qstr_compute_hash(str_data, str_len); } - #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK - if (!utf8_check(str_data, str_len)) { - mp_raise_msg(&mp_type_UnicodeError, NULL); - } - #endif + mp_utf8_require(str_data, str_len); // Check if a qstr with this data already exists qstr q = qstr_find_strn((const char *)str_data, str_len); @@ -2285,17 +2281,13 @@ static mp_obj_t mp_obj_new_str_type_from_vstr(const mp_obj_type_t *type, vstr_t } mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr) { - #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK - if (!utf8_check((byte *)vstr->buf, vstr->len)) { - mp_raise_msg(&mp_type_UnicodeError, NULL); - } - #endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK + mp_utf8_require((byte *)vstr->buf, vstr->len); return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr); } #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr) { - // bypasses utf8_check. + // bypasses utf8_require. 
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr); } #endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK @@ -2305,11 +2297,7 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr) { } mp_obj_t mp_obj_new_str(const char *data, size_t len) { - #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK - if (!utf8_check((byte *)data, len)) { - mp_raise_msg(&mp_type_UnicodeError, NULL); - } - #endif + mp_utf8_require((byte *)data, len); qstr q = qstr_find_strn(data, len); if (q != MP_QSTRnull) { // qstr with this data already exists @@ -2471,3 +2459,39 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) { o->cur = 0; return MP_OBJ_FROM_PTR(o); } + +#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK +static bool mp_utf8_check(const byte *p, size_t len) { + uint8_t need = 0; + const byte *end = p + len; + for (; p < end; p++) { + byte c = *p; + if (need) { + if (UTF8_IS_CONT(c)) { + need--; + } else { + // mismatch + return 0; + } + } else { + if (c >= 0xc0) { + if (c >= 0xf8) { + // mismatch + return 0; + } + need = (0xe5 >> ((c >> 3) & 0x6)) & 3; + } else if (c >= 0x80) { + // mismatch + return 0; + } + } + } + return need == 0; // no pending fragments allowed +} + +void mp_utf8_require(const byte *p, size_t len) { + if (!mp_utf8_check(p, len)) { + mp_raise_msg(&mp_type_UnicodeError, NULL); + } +} +#endif diff --git a/py/objstr.h b/py/objstr.h index 028fc9597ffc8..2b87f27c56a51 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -119,4 +119,13 @@ extern const mp_obj_dict_t mp_obj_bytearray_locals_dict; extern const mp_obj_dict_t mp_obj_array_locals_dict; #endif +#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK +// Throws an exception if string content is not UTF-8 +void mp_utf8_require(const byte *p, size_t len); +#else +// If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op +static inline void 
mp_utf8_require(const byte *p, size_t len) { +} +#endif + #endif // MICROPY_INCLUDED_PY_OBJSTR_H diff --git a/py/unicode.c b/py/unicode.c index 81a37880f3c72..076ab71f6c8cd 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -177,35 +177,3 @@ mp_uint_t unichar_xdigit_value(unichar c) { } return n; } - -#if MICROPY_PY_BUILTINS_STR_UNICODE - -bool utf8_check(const byte *p, size_t len) { - uint8_t need = 0; - const byte *end = p + len; - for (; p < end; p++) { - byte c = *p; - if (need) { - if (UTF8_IS_CONT(c)) { - need--; - } else { - // mismatch - return 0; - } - } else { - if (c >= 0xc0) { - if (c >= 0xf8) { - // mismatch - return 0; - } - need = (0xe5 >> ((c >> 3) & 0x6)) & 3; - } else if (c >= 0x80) { - // mismatch - return 0; - } - } - } - return need == 0; // no pending fragments allowed -} - -#endif diff --git a/py/unicode.h b/py/unicode.h index c1fb517894f64..19487a65ae8b9 100644 --- a/py/unicode.h +++ b/py/unicode.h @@ -30,6 +30,5 @@ #include "py/misc.h" mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr); -bool utf8_check(const byte *p, size_t len); #endif // MICROPY_INCLUDED_PY_UNICODE_H From 90e366eefa80b27601bf30f3a1bdf7f562548859 Mon Sep 17 00:00:00 2001 From: Jeff Epler Date: Thu, 7 Aug 2025 09:02:57 -0500 Subject: [PATCH 3/3] parse: Don't allow creation of invalid UTF8 strings or identifiers. .. even when compiling non UTF-8 files or byte strings. 
Closes: #17855 Signed-off-by: Jeff Epler --- py/objstr.c | 2 +- py/objstr.h | 4 ++++ py/parse.c | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/py/objstr.c b/py/objstr.c index 14932130058d3..087dc478b31b4 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -2461,7 +2461,7 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) { } #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK -static bool mp_utf8_check(const byte *p, size_t len) { +bool mp_utf8_check(const byte *p, size_t len) { uint8_t need = 0; const byte *end = p + len; for (; p < end; p++) { diff --git a/py/objstr.h b/py/objstr.h index 2b87f27c56a51..d09284045aa09 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -122,10 +122,14 @@ extern const mp_obj_dict_t mp_obj_array_locals_dict; #if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK // Throws an exception if string content is not UTF-8 void mp_utf8_require(const byte *p, size_t len); +bool mp_utf8_check(const byte *p, size_t len); #else // If unicode strings are not enabled, or the check is explicitly disabled, it's a no-op static inline void mp_utf8_require(const byte *p, size_t len) { } +static inline bool mp_utf8_check(const byte *p, size_t len) { + return true; +} #endif #endif // MICROPY_INCLUDED_PY_OBJSTR_H diff --git a/py/parse.c b/py/parse.c index 1a50b13b5c790..4e818edcda1e4 100644 --- a/py/parse.c +++ b/py/parse.c @@ -598,6 +598,11 @@ static mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t static void push_result_token(parser_t *parser, uint8_t rule_id) { mp_parse_node_t pn; mp_lexer_t *lex = parser->lexer; + if (lex->tok_kind == MP_TOKEN_NAME || lex->tok_kind == MP_TOKEN_STRING) { + if (!mp_utf8_check((byte *)lex->vstr.buf, lex->vstr.len)) { + mp_raise_msg(&mp_type_SyntaxError, NULL); + } + } if (lex->tok_kind == MP_TOKEN_NAME) { qstr id = qstr_from_strn(lex->vstr.buf, lex->vstr.len); #if MICROPY_COMP_CONST