Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 40 additions & 16 deletions py/objstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
if (str_hash == 0) {
str_hash = qstr_compute_hash(str_data, str_len);
}
#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
if (!utf8_check(str_data, str_len)) {
mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif
mp_utf8_require(str_data, str_len);

// Check if a qstr with this data already exists
qstr q = qstr_find_strn((const char *)str_data, str_len);
Expand Down Expand Up @@ -2285,17 +2281,13 @@ static mp_obj_t mp_obj_new_str_type_from_vstr(const mp_obj_type_t *type, vstr_t
}

// Build an object of type mp_type_str from the contents of vstr by handing it
// to mp_obj_new_str_type_from_vstr.
// The buffer (vstr->buf, vstr->len) is validated first; UnicodeError is raised
// with no message if validation fails.
// NOTE(review): the guarded utf8_check() block and the mp_utf8_require() call
// both validate the same buffer -- presumably these are the removed and added
// sides of a diff shown together; confirm that only one is meant to remain.
mp_obj_t mp_obj_new_str_from_vstr(vstr_t *vstr) {
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
if (!utf8_check((byte *)vstr->buf, vstr->len)) {
mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
mp_utf8_require((byte *)vstr->buf, vstr->len);
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
}

#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
mp_obj_t mp_obj_new_str_from_utf8_vstr(vstr_t *vstr) {
// Bypasses the UTF-8 validation (utf8_check / mp_utf8_require) that
// mp_obj_new_str_from_vstr runs before building the object.
// NOTE(review): presumably callers must already know vstr holds valid
// UTF-8 -- confirm against call sites.
return mp_obj_new_str_type_from_vstr(&mp_type_str, vstr);
}
#endif // MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
Expand All @@ -2305,11 +2297,7 @@ mp_obj_t mp_obj_new_bytes_from_vstr(vstr_t *vstr) {
}

mp_obj_t mp_obj_new_str(const char *data, size_t len) {
#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
if (!utf8_check((byte *)data, len)) {
mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif
mp_utf8_require((byte *)data, len);
qstr q = qstr_find_strn(data, len);
if (q != MP_QSTRnull) {
// qstr with this data already exists
Expand Down Expand Up @@ -2471,3 +2459,39 @@ mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
o->cur = 0;
return MP_OBJ_FROM_PTR(o);
}

#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
// Check that the `len` bytes starting at `p` have the shape of UTF-8: every
// lead byte is followed by exactly the number of continuation bytes it
// announces, and the buffer does not stop in the middle of a sequence.
// Returns true if the buffer passes, false otherwise.
// Only the byte structure is examined; overlong forms and surrogate code
// points are not rejected.
bool mp_utf8_check(const byte *p, size_t len) {
    const byte *stop = p + len;
    uint8_t pending = 0; // continuation bytes still expected

    while (p < stop) {
        byte ch = *p++;

        if (pending != 0) {
            // Inside a multi-byte sequence: only continuation bytes may follow.
            if (!UTF8_IS_CONT(ch)) {
                return false;
            }
            --pending;
            continue;
        }

        if (ch < 0x80) {
            // Single-byte character, nothing more to expect.
            continue;
        }

        if (ch < 0xc0 || ch >= 0xf8) {
            // Stray byte in 0x80..0xbf, or a lead byte outside 0xc0..0xf7.
            return false;
        }

        // 0xe5 packs the continuation counts 1, 1, 2, 3 as 2-bit fields;
        // bits 4-5 of the lead byte select which field applies.
        pending = (0xe5 >> ((ch >> 3) & 0x6)) & 3;
    }

    // A sequence left unfinished at the end of the buffer is an error.
    return pending == 0;
}

// Raise UnicodeError (with no message) unless mp_utf8_check accepts the
// `len` bytes starting at `p`.
void mp_utf8_require(const byte *p, size_t len) {
    bool valid = mp_utf8_check(p, len);
    if (valid) {
        return;
    }
    mp_raise_msg(&mp_type_UnicodeError, NULL);
}
#endif
13 changes: 13 additions & 0 deletions py/objstr.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,17 @@ extern const mp_obj_dict_t mp_obj_bytearray_locals_dict;
extern const mp_obj_dict_t mp_obj_array_locals_dict;
#endif

#if MICROPY_PY_BUILTINS_STR_UNICODE && MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
// Throws an exception if string content is not UTF-8
void mp_utf8_require(const byte *p, size_t len);
bool mp_utf8_check(const byte *p, size_t len);
#else
// No-op stub used when unicode strings are not enabled or the check is
// explicitly disabled: nothing is validated and nothing is raised.
static inline void mp_utf8_require(const byte *p, size_t len) {
    (void)p;
    (void)len;
}
// Stub for the configuration without the UTF-8 check: every buffer is
// reported as valid.
static inline bool mp_utf8_check(const byte *p, size_t len) {
    (void)p;
    (void)len;
    return true;
}
#endif

#endif // MICROPY_INCLUDED_PY_OBJSTR_H
5 changes: 5 additions & 0 deletions py/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,11 @@ static mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t
static void push_result_token(parser_t *parser, uint8_t rule_id) {
mp_parse_node_t pn;
mp_lexer_t *lex = parser->lexer;
if (lex->tok_kind == MP_TOKEN_NAME || lex->tok_kind == MP_TOKEN_STRING) {
if (!mp_utf8_check((byte *)lex->vstr.buf, lex->vstr.len)) {
mp_raise_msg(&mp_type_SyntaxError, NULL);
}
}
if (lex->tok_kind == MP_TOKEN_NAME) {
qstr id = qstr_from_strn(lex->vstr.buf, lex->vstr.len);
#if MICROPY_COMP_CONST
Expand Down
32 changes: 0 additions & 32 deletions py/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,35 +177,3 @@ mp_uint_t unichar_xdigit_value(unichar c) {
}
return n;
}

#if MICROPY_PY_BUILTINS_STR_UNICODE

// Return true if the `len` bytes at `p` are structurally well-formed UTF-8:
// each lead byte is followed by the right number of continuation bytes and
// no sequence is left unfinished at the end. Returns false otherwise.
// Overlong forms and surrogate code points are not rejected.
bool utf8_check(const byte *p, size_t len) {
    uint8_t remaining = 0; // continuation bytes still owed by the current lead

    for (size_t i = 0; i < len; i++) {
        byte b = p[i];

        if (remaining > 0) {
            // Only a continuation byte may appear inside a sequence.
            if (!UTF8_IS_CONT(b)) {
                return false;
            }
            remaining--;
        } else if (b >= 0xf8) {
            // Not a valid lead byte.
            return false;
        } else if (b >= 0xc0) {
            // Lead byte: look up the continuation count (1, 1, 2 or 3) in the
            // 2-bit fields of 0xe5, indexed by bits 4-5 of the lead byte.
            remaining = (0xe5 >> ((b >> 3) & 0x6)) & 3;
        } else if (b >= 0x80) {
            // Continuation-range byte with no lead byte before it.
            return false;
        }
    }

    // No pending fragments allowed.
    return remaining == 0;
}

#endif
1 change: 0 additions & 1 deletion py/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,5 @@
#include "py/misc.h"

mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
bool utf8_check(const byte *p, size_t len);

#endif // MICROPY_INCLUDED_PY_UNICODE_H
23 changes: 23 additions & 0 deletions tests/unicode/unicode_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Test invalid UTF-8 in source handed to eval/exec.
# Passing byte strings to exec/eval is a MicroPython extension.

# String literals containing malformed UTF-8: each case prints "Exception"
# when the evaluation raises SyntaxError.
for source in (
    b"'ab\xa1'",
    b"'ab\xf8'",
    bytearray(b"'ab\xc0a'"),
    b"'\xf0\xe0\xed\xe8'",
):
    try:
        eval(source)
    except SyntaxError:
        print("Exception")

# An identifier containing an invalid byte.
try:
    exec(b"b\xff = 1")
except SyntaxError:
    print("Exception")
Loading