Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
c239f50
Add PEP 393-flags to strings and stub usage.
Rosuav Jun 3, 2014
89452be
Update comments - now aiming for UTF-8 rather than PEP 393 strings
Rosuav Jun 5, 2014
b0f41c7
Beginnings of UTF-8 support - construct strings from that many UTF-8-…
Rosuav Jun 5, 2014
47c2345
objstr: Record character length separately from byte length
Rosuav Jun 6, 2014
cd2cf66
HACK - When indexing a qstr, count its charlen. Stupidly inefficient …
Rosuav Jun 6, 2014
16429b8
Make len(s) return character length (even though creation's still buggy)
Rosuav Jun 6, 2014
6df1b94
Add test of UTF-8 encoded source file resulting in properly formed st…
Rosuav Jun 6, 2014
231031a
Add character length to qstr
Rosuav Jun 6, 2014
e924659
Add support for \u and \U escapes, but not \N (with explanatory comment)
Rosuav Jun 6, 2014
03f0cbe
Retain characters as UTF-8 encoded Unicode
Rosuav Jun 7, 2014
bb13212
Make ord() Unicode-aware
Rosuav Jun 7, 2014
7bc9190
Record byte lengths for byte strings
Rosuav Jun 7, 2014
01bd686
Expand the Unicode tests
Rosuav Jun 7, 2014
f51ad73
Make a string's repr Unicode-aware
Rosuav Jun 7, 2014
f1911f5
Make chr() Unicode-aware
Rosuav Jun 7, 2014
279de0c
Formatting/layout improvements - introduce macros for UTF-8 byte dete…
Rosuav Jun 7, 2014
f9bebb2
Whitespace fixes
Rosuav Jun 7, 2014
bc990da
Revert "Add PEP 393-flags to strings and stub usage."
Rosuav Jun 7, 2014
30d1bad
Make utf8_get_char() and utf8_next_char() actually do what their name…
Rosuav Jun 7, 2014
44b0d5c
Use utf8_get/next_char in building up a string's repr
Rosuav Jun 7, 2014
a019ba9
Add a unichar_charlen() function to calculate length-in-characters fr…
Rosuav Jun 7, 2014
5c1658e
Get rid of mp_obj_str_get_data_len() which was used in only one place
Rosuav Jun 7, 2014
5473e1a
Remove the charlen field from strings, calculating it when required
Rosuav Jun 7, 2014
0bcc7ab
Clean up constant qstr declarations now that charlen isn't needed
Rosuav Jun 8, 2014
a24d19f
Change string indexing to not precalculate the charlen, and add test …
Rosuav Jun 9, 2014
616c24a
Add tests of string slicing, which currently fail
Rosuav Jun 9, 2014
24371c7
Break out index-to-pointer calculation into a function
Rosuav Jun 9, 2014
0d339a1
Support slicing in str_index_to_ptr, and fix a bounds error
Rosuav Jun 9, 2014
086a2a0
Properly implement string slicing
Rosuav Jun 9, 2014
e2c9782
More whitespace fixups
Rosuav Jun 9, 2014
c962057
Merge branch 'master' into unicode, resolving conflict on py/obj.h
Rosuav Jun 9, 2014
5bf0153
Test a default (= UTF-8) encode and decode
Rosuav Jun 9, 2014
99dc21b
Optimize as per TODO (thanks Damien!)
Rosuav Jun 11, 2014
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions py/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,30 @@ STATIC mp_obj_t mp_builtin_callable(mp_obj_t o_in) {
MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_callable_obj, mp_builtin_callable);

// Implements the builtin chr(): builds a one-character str from an integer
// code point by encoding it as UTF-8 (1 to 4 bytes).
//
// o_in: object converted to an integer with mp_obj_get_int().
// Returns: a new str object holding the encoded character.
// Raises: ValueError if the code point is negative or greater than 0x10ffff.
STATIC mp_obj_t mp_builtin_chr(mp_obj_t o_in) {
    int c = mp_obj_get_int(o_in);
    char str[4];
    int len = 0;
    if (c < 0 || c >= 0x110000) {
        // Negative values must be rejected explicitly: they would otherwise
        // satisfy "c < 0x80" and be emitted as a single invalid byte.
        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)"));
    } else if (c < 0x80) {
        // 0xxxxxxx
        str[0] = c;
        len = 1;
    } else if (c < 0x800) {
        // 110xxxxx 10xxxxxx
        str[0] = (c >> 6) | 0xC0;
        str[1] = (c & 0x3F) | 0x80;
        len = 2;
    } else if (c < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 12) | 0xE0;
        str[1] = ((c >> 6) & 0x3F) | 0x80;
        str[2] = (c & 0x3F) | 0x80;
        len = 3;
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 18) | 0xF0;
        str[1] = ((c >> 12) & 0x3F) | 0x80;
        str[2] = ((c >> 6) & 0x3F) | 0x80;
        str[3] = (c & 0x3F) | 0x80;
        len = 4;
    }
    return mp_obj_new_str(str, len, true);
}

MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_chr_obj, mp_builtin_chr);
Expand Down Expand Up @@ -342,12 +359,22 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct);
// Implements the builtin ord(): returns the integer value of a one-element
// string-like object.
//
//  - For a str, the argument must contain exactly one character; its UTF-8
//    sequence is decoded and the resulting code point is returned.
//  - For any other object accepted by mp_obj_str_get_data() (e.g. bytes), the
//    argument must contain exactly one byte, whose unsigned value is returned.
//
// Raises: TypeError if the argument does not have length 1.
STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
    uint len;
    const char *str = mp_obj_str_get_data(o_in, &len);
    // Only str objects are measured in characters; everything else is
    // measured in bytes so that raw byte data is never interpreted as UTF-8.
    uint charlen = MP_OBJ_IS_STR(o_in) ? unichar_charlen(str, len) : len;
    if (charlen == 1) {
        if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) {
            const char *top = str + len;
            // Drop the top bit, then clear the remaining leading 1 bits of the
            // lead byte (the sequence-length marker), leaving only payload bits.
            machine_int_t ord = *str++ & 0x7F;
            for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
                ord &= ~mask;
            }
            // Each continuation byte contributes its low 6 bits. The scan is
            // bounded by the data length rather than relying on a trailing NUL.
            while (str < top && UTF8_IS_CONT(*str)) {
                ord = (ord << 6) | (*str++ & 0x3F);
            }
            return mp_obj_new_int(ord);
        } else {
            // don't sign extend when converting to ord
            return mp_obj_new_int(((const byte*)str)[0]);
        }
    } else {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", (int)charlen));
    }
}

Expand Down
29 changes: 24 additions & 5 deletions py/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
case 'v': c = 0x0b; break;
case 'f': c = 0x0c; break;
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (is_bytes) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
}
// Otherwise fall through.
case 'x':
{
uint num = 0;
if (!get_hex(lex, 2, &num)) {
if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
// TODO error message
assert(0);
}
c = num;
break;
}
case 'N': break; // TODO \N{name} only in strings
case 'u': break; // TODO \uxxxx only in strings
case 'U': break; // TODO \Uxxxxxxxx only in strings
case 'N':
// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
// 3MB of text; even gzip-compressed and with minimal structure, it'll take
// roughly half a meg of storage. This form of Unicode escape may be added
// later on, but it's definitely not a priority right now. -- CJA 20140607
assert(!"Unicode name escapes not supported");
break;
default:
if (c >= '0' && c <= '7') {
// Octal sequence, 1-3 chars
Expand All @@ -533,7 +546,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
}
}
if (c != MP_LEXER_CHAR_EOF) {
vstr_add_char(&lex->vstr, c);
if (c < 0x110000 && !is_bytes) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && is_bytes) {
vstr_add_byte(&lex->vstr, c);
} else {
assert(!"TODO: Throw an error, invalid escape code probably");
}
}
} else {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Expand Down
1 change: 1 addition & 0 deletions py/makeqstrdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
codepoint2name[ord('*')] = 'star'

# this must match the equivalent function in qstr.c
# Note that this hashes the UTF-8 encoded data bytes.
def compute_hash(qstr):
hash = 5381
for char in qstr:
Expand Down
3 changes: 3 additions & 0 deletions py/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ bool unichar_isupper(unichar c);
bool unichar_islower(unichar c);
unichar unichar_tolower(unichar c);
unichar unichar_toupper(unichar c);
uint unichar_charlen(const char *str, uint len);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be called utf8_charlen?

// Non-zero (0x80, not 1) if byte ch has its high bit set, i.e. it belongs to a
// multi-byte UTF-8 sequence rather than being a plain ASCII character.
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
// True if byte ch has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation byte and not the lead byte of a character.
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)

/** variable string *********************************************/

Expand Down
2 changes: 1 addition & 1 deletion py/obj.h
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway conve
const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
mp_obj_t mp_obj_str_intern(mp_obj_t str);
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len);
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes);

#if MICROPY_PY_BUILTINS_FLOAT
// float
Expand Down
2 changes: 1 addition & 1 deletion py/objarray.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ STATIC void array_print(void (*print)(void *env, const char *fmt, ...), void *en
mp_obj_array_t *o = o_in;
if (o->typecode == BYTEARRAY_TYPECODE) {
print(env, "bytearray(b", o->typecode);
mp_str_print_quoted(print, env, o->items, o->len);
mp_str_print_quoted(print, env, o->items, o->len, true);
} else {
print(env, "array('%c'", o->typecode);
if (o->len > 0) {
Expand Down
144 changes: 125 additions & 19 deletions py/objstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ STATIC bool is_str_or_bytes(mp_obj_t o) {
/******************************************************************************/
/* str */

void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes) {
// this escapes characters, but it will be very slow to print (calling print many times)
bool has_single_quote = false;
bool has_double_quote = false;
Expand All @@ -80,21 +80,33 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
quote_char = '"';
}
print(env, "%c", quote_char);
for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
if (*s == quote_char) {
const char *s = (const char *)str_data, *top = (const char *)str_data + str_len;
while (s < top) {
unichar ch;
if (is_bytes) {
ch = *(unsigned char *)s++; // Don't sign-extend bytes
} else {
ch = utf8_get_char(s);
s = utf8_next_char(s);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be better to have a combined utf8_get_and_next_char() function?

}
if (ch == quote_char) {
print(env, "\\%c", quote_char);
} else if (*s == '\\') {
} else if (ch == '\\') {
print(env, "\\\\");
} else if (32 <= *s && *s <= 126) {
print(env, "%c", *s);
} else if (*s == '\n') {
} else if (32 <= ch && ch <= 126) {
print(env, "%c", ch);
} else if (ch == '\n') {
print(env, "\\n");
} else if (*s == '\r') {
} else if (ch == '\r') {
print(env, "\\r");
} else if (*s == '\t') {
} else if (ch == '\t') {
print(env, "\\t");
} else if (ch < 0x100) {
print(env, "\\x%02x", ch);
} else if (ch < 0x10000) {
print(env, "\\u%04x", ch);
} else {
print(env, "\\x%02x", *s);
print(env, "\\U%08x", ch);
}
}
print(env, "%c", quote_char);
Expand All @@ -109,7 +121,7 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env,
if (is_bytes) {
print(env, "b");
}
mp_str_print_quoted(print, env, str_data, str_len);
mp_str_print_quoted(print, env, str_data, str_len, is_bytes);
}
}

Expand Down Expand Up @@ -348,27 +360,121 @@ STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
return MP_OBJ_NULL; // op not supported
}

// Convert a character index into a pointer to that character's lead byte
// within a UTF-8 encoded string.
//
// self_data/self_len: the string's bytes and its length in bytes.
// index: the index object; negative values count from the end of the string.
// is_slice: selects the out-of-bounds policy (see below).
//
// Returns: a pointer into [self_data, self_data + self_len]. When is_slice is
// true an out-of-range index is clamped to self_data (too far left) or to
// self_data + self_len (too far right). When is_slice is false the result
// always points at a real character inside the string.
//
// Raises: TypeError if index is not an integer; IndexError if is_slice is
// false and the index is out of range (including any index into an empty
// string).
STATIC const char *str_index_to_ptr(const char *self_data, uint self_len, mp_obj_t index, bool is_slice) {
    machine_int_t i;
    // Copied from mp_get_index; I don't want bounds checking, just give me
    // the integer as-is. (I can't bounds-check without scanning the whole
    // string; an out-of-bounds index will be caught in the loops below.)
    if (MP_OBJ_IS_SMALL_INT(index)) {
        i = MP_OBJ_SMALL_INT_VALUE(index);
    } else if (!mp_obj_get_int_maybe(index, &i)) {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
    }

    const char *top = self_data + self_len;
    const char *s;
    if (i < 0) {
        // Negative indexing is performed by counting from the end of the string.
        // It's assumed that negative indexing will generally be used with small
        // absolute values (eg str[-1], not str[-1000000]), which means it'll be
        // more efficient this way.
        s = top;
        while (i < 0) {
            if (s <= self_data) {
                // Ran off the front of the string before reaching the index.
                // Checked before stepping back so that s never points before
                // the start of the buffer.
                if (is_slice) {
                    return self_data;
                }
                nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
            }
            --s;
            if (!UTF8_IS_CONT(*s)) {
                // Reached a lead byte: one more character has been passed.
                ++i;
            }
        }
        return s;
    }

    // Positive indexing, correspondingly, counts from the start of the string.
    // Invariant: at the top of each iteration s is either at the start of a
    // character or at top (end of data).
    s = self_data;
    while (true) {
        if (s >= top) {
            // Reached the end of the data before reaching the index. This is
            // checked before returning so that a non-slice lookup can never
            // yield a pointer to the end of the string.
            if (is_slice) {
                return top;
            }
            nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
        }
        if (i == 0) {
            return s;
        }
        --i;
        // Advance past the current character: its lead byte plus any
        // continuation bytes, never reading beyond top.
        ++s;
        while (s < top && UTF8_IS_CONT(*s)) {
            ++s;
        }
    }
}

STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
mp_obj_type_t *type = mp_obj_get_type(self_in);
GET_STR_DATA_LEN(self_in, self_data, self_len);
if (value == MP_OBJ_SENTINEL) {
// load
#if MICROPY_PY_BUILTINS_SLICE
if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
mp_bound_slice_t slice;
if (!mp_seq_get_fast_slice_indexes(self_len, index, &slice)) {
mp_obj_t ostart, ostop, ostep;
mp_obj_slice_get(index, &ostart, &ostop, &ostep);
if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
nlr_raise(mp_obj_new_exception_msg(&mp_type_NotImplementedError,
"only slices with step=1 (aka None) are supported"));
}
return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);

if (type == &mp_type_bytes) {
machine_int_t start = 0, stop = self_len;
if (ostart != mp_const_none) {
start = MP_OBJ_SMALL_INT_VALUE(ostart);
if (start < 0) {
start = self_len + start;
}
}
if (ostop != mp_const_none) {
stop = MP_OBJ_SMALL_INT_VALUE(ostop);
if (stop < 0) {
stop = self_len + stop;
}
}
return mp_obj_new_str_of_type(type, self_data + start, stop - start);
}
const char *pstart, *pstop;
if (ostart != mp_const_none) {
pstart = str_index_to_ptr((const char *)self_data, self_len, ostart, true);
} else {
pstart = (const char *)self_data;
}
if (ostop != mp_const_none) {
// pstop will point just after the stop character. This depends on
// the \0 at the end of the string.
pstop = str_index_to_ptr((const char *)self_data, self_len, ostop, true);
} else {
pstop = (const char *)self_data + self_len;
}
if (pstop < pstart) {
return MP_OBJ_NEW_QSTR(MP_QSTR_);
}
return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
}
#endif
uint index_val = mp_get_index(type, self_len, index, false);
if (type == &mp_type_bytes) {
uint index_val = mp_get_index(type, self_len, index, false);
return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
} else {
return mp_obj_new_str((char*)self_data + index_val, 1, true);
}
const char *s = str_index_to_ptr((const char *)self_data, self_len, index, false);
int len = 1;
if (UTF8_IS_NONASCII(*s)) {
// Count the number of 1 bits (after the first)
for (char mask = 0x40; *s & mask; mask >>= 1) {
++len;
}
}
return mp_obj_new_str(s, len, true); // This will create a one-character string
} else {
return MP_OBJ_NULL; // op not supported
}
Expand Down Expand Up @@ -1800,8 +1906,8 @@ uint mp_obj_str_get_hash(mp_obj_t self_in) {
uint mp_obj_str_get_len(mp_obj_t self_in) {
// TODO This has a double check for the type, one in obj.c and one here
if (MP_OBJ_IS_STR(self_in) || MP_OBJ_IS_TYPE(self_in, &mp_type_bytes)) {
GET_STR_LEN(self_in, l);
return l;
GET_STR_DATA_LEN(self_in, self_data, self_len);
return unichar_charlen((const char *)self_data, self_len);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be self_len if the type is mp_type_bytes?

} else {
bad_implicit_conversion(self_in);
}
Expand Down
2 changes: 1 addition & 1 deletion py/objstr.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ typedef struct _mp_obj_str_t {
machine_uint_t hash : 16;
// len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
machine_uint_t len : 16;
const byte *data;
const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
} mp_obj_str_t;

// Defines an mp_obj_str_t variable named obj_name for the string literal str:
// type mp_type_str, a 0 in the next slot (presumably the hash field, i.e. not
// precomputed -- confirm against the full struct), length sizeof(str) - 1
// (bytes, excluding the terminating NUL), and data pointing at str.
#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, (const byte*)str};
Expand Down
4 changes: 3 additions & 1 deletion py/qstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#define Q_GET_DATA(q) ((q) + 4)

// this must match the equivalent function in makeqstrdata.py
// Note that this hashes the UTF-8 encoded data bytes.
machine_uint_t qstr_compute_hash(const byte *data, uint len) {
// djb2 algorithm; see http://www.cse.yorku.ca/~oz/hash.html
machine_uint_t hash = 5381;
Expand Down Expand Up @@ -179,7 +180,8 @@ qstr qstr_build_end(byte *q_ptr) {
qstr q = qstr_find_strn((const char*)Q_GET_DATA(q_ptr), Q_GET_LENGTH(q_ptr));
if (q == 0) {
machine_uint_t len = Q_GET_LENGTH(q_ptr);
machine_uint_t hash = qstr_compute_hash(Q_GET_DATA(q_ptr), len);
const byte *str = Q_GET_DATA(q_ptr);
machine_uint_t hash = qstr_compute_hash(str, len);
q_ptr[0] = hash;
q_ptr[1] = hash >> 8;
q_ptr[4 + len] = '\0';
Expand Down
Loading