Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
12bc13e
mpconfig.h: Add MICROPY_PY_BUILTINS_STR_UNICODE.
pfalcon Jun 12, 2014
c88987c
py: Implement basic unicode functions.
Rosuav Jun 3, 2014
8386534
objstrunicode: Complete copy of objstr, to be patched for unicode sup…
pfalcon Jun 12, 2014
64b468d
objstrunicode: Basic implementation of unicode handling.
Rosuav Jun 3, 2014
9a1a4be
builtin: ord, chr: Unicode support.
Rosuav Jun 3, 2014
1e3781b
tests: Add unicode test.
Rosuav Jun 3, 2014
2ba2299
lexer, vstr: Add unicode support.
Rosuav Jun 3, 2014
42a5251
builtin: Restore bytestr compatibility.
pfalcon Jun 12, 2014
165eb69
vstr: Restore bytestr compatibility.
pfalcon Jun 12, 2014
9731912
py: Prune unneeded code from objstrunicode, reuse code in objstr.
pfalcon Jun 13, 2014
d215ee1
py: Make MICROPY_PY_BUILTINS_STR_UNICODE=1 buildable.
pfalcon Jun 13, 2014
86d3898
objstrunicode: Get rid of bytes checking, it's separate type.
pfalcon Jun 13, 2014
e7f2b4c
objstrunicode: Revamp len() handling for unicode, and optimize bool().
pfalcon Jun 13, 2014
cdc020d
objstrunicode: Re-add buffer protocol back for now, required for io.S…
pfalcon Jun 13, 2014
79b7fe2
objstrunicode: Implement iterator.
pfalcon Jun 13, 2014
17994d1
tests: Add test for unicode string iteration.
pfalcon Jun 13, 2014
ded0fc7
py: Add dedicated unicode header.
pfalcon Jun 14, 2014
46d31e9
unicode: Add utf8_ptr_to_index().
pfalcon Jun 14, 2014
5048df0
objstr: find(), rfind(), index(): Make return value be unicode-aware.
pfalcon Jun 14, 2014
b1949e4
tests: Add tests for unicode find()/rfind()/index().
pfalcon Jun 14, 2014
1044c3d
unicode: Make get_char()/next_char()/charlen() be 8-bit compatible.
pfalcon Jun 14, 2014
00c904b
objstrunicode: Signedness issues.
pfalcon Jun 14, 2014
26fda6d
objstr: 64-bit issues.
pfalcon Jun 14, 2014
ea2c936
objstrunicode: Refactor str_index_to_ptr() following objstr.
pfalcon Jun 14, 2014
63143c9
tests: Test for explicit start/end args to str methods for unicode.
pfalcon Jun 14, 2014
ce81312
misc: Add count_lead_ones() function, useful for UTF-8 handling.
pfalcon Jun 15, 2014
f5f6c3b
streams: Reading by char count from unicode text streams is not imple…
pfalcon Jun 15, 2014
ed07d03
tests: Add basic test for unicode file i/o.
pfalcon Jun 15, 2014
b3a50f0
Merge branch 'master' into unicode
dpgeorge Jun 28, 2014
e04a44e
py: Small comments, name changes, use of machine_int_t.
dpgeorge Jun 28, 2014
41736f8
tests: Write output in byte mode, not text mode.
dpgeorge Jun 28, 2014
8546ce1
py: Add missing #endif.
dpgeorge Jun 28, 2014
635b60e
unix, stmhal: Add option for STR_UNICODE to mpconfigport.h.
dpgeorge Jun 28, 2014
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 48 additions & 2 deletions py/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,40 @@ STATIC mp_obj_t mp_builtin_callable(mp_obj_t o_in) {
MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_callable_obj, mp_builtin_callable);

// chr(i): return a string holding the single character whose code is the
// integer i.
//
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled the code point is encoded as
// 1 to 4 bytes of UTF-8.  Without it a single byte is stored.
//
// Raises ValueError if i is negative or is not below 0x110000.
STATIC mp_obj_t mp_builtin_chr(mp_obj_t o_in) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    machine_int_t c = mp_obj_get_int(o_in);
    char str[4];
    int len = 0;
    if (c < 0 || c >= 0x110000) {
        // Negative values must be rejected explicitly: they would otherwise
        // satisfy the "c < 0x80" test below and be emitted as a raw byte.
        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)"));
    } else if (c < 0x80) {
        // 0xxxxxxx
        str[0] = c;
        len = 1;
    } else if (c < 0x800) {
        // 110xxxxx 10xxxxxx
        str[0] = (c >> 6) | 0xC0;
        str[1] = (c & 0x3F) | 0x80;
        len = 2;
    } else if (c < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 12) | 0xE0;
        str[1] = ((c >> 6) & 0x3F) | 0x80;
        str[2] = (c & 0x3F) | 0x80;
        len = 3;
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        str[0] = (c >> 18) | 0xF0;
        str[1] = ((c >> 12) & 0x3F) | 0x80;
        str[2] = ((c >> 6) & 0x3F) | 0x80;
        str[3] = (c & 0x3F) | 0x80;
        len = 4;
    }
    return mp_obj_new_str(str, len, true);
#else
    machine_int_t ord = mp_obj_get_int(o_in);
    if (0 <= ord && ord <= 0x10ffff) {
        // NOTE(review): only one byte is stored, so values above 0xff are
        // truncated to their low byte here -- confirm this is intended.
        char str[1] = {ord};
        return mp_obj_new_str(str, 1, true);
    } else {
        nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)"));
    }
#endif
}

MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_chr_obj, mp_builtin_chr);
Expand Down Expand Up @@ -344,13 +371,32 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct);
// ord(c): return the integer code of a one-character str, or the value of
// the single byte of a one-byte non-str object (e.g. bytes).
//
// With MICROPY_PY_BUILTINS_STR_UNICODE enabled, a str is measured in
// characters (via unichar_charlen) and its UTF-8 sequence is decoded to a
// code point.  Any other object is always measured in bytes, so a multi-byte
// bytes object is never mistaken for a single character.
//
// Raises TypeError if the argument does not hold exactly one character/byte.
STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
    uint len;
    const char *str = mp_obj_str_get_data(o_in, &len);

    // Number of items ord() sees: bytes by default, characters for unicode str.
    uint nitems = len;
#if MICROPY_PY_BUILTINS_STR_UNICODE
    if (MP_OBJ_IS_STR(o_in)) {
        nitems = unichar_charlen(str, len);
    }
#endif

    if (nitems == 1) {
        // Work on unsigned bytes so that values >= 0x80 are not sign extended.
        const byte *s = (const byte*)str;
        machine_int_t ord = *s++;
#if MICROPY_PY_BUILTINS_STR_UNICODE
        if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(ord)) {
            const byte *top = (const byte*)str + len;
            // Strip the run of leading 1 bits that marks the sequence length
            // in the lead byte, keeping only its payload bits.
            ord &= 0x7F;
            for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
                ord &= ~mask;
            }
            // Fold in 6 payload bits per continuation byte, never reading
            // past the end of the string data.
            while (s < top && UTF8_IS_CONT(*s)) {
                ord = (ord << 6) | (*s++ & 0x3F);
            }
        }
#endif
        return mp_obj_new_int(ord);
    } else {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", nitems));
    }
}

MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_ord_obj, mp_builtin_ord);
Expand Down
29 changes: 24 additions & 5 deletions py/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
case 'v': c = 0x0b; break;
case 'f': c = 0x0c; break;
case 'r': c = 0x0d; break;
case 'u':
case 'U':
if (is_bytes) {
// b'\u1234' == b'\\u1234'
vstr_add_char(&lex->vstr, '\\');
break;
}
// Otherwise fall through.
case 'x':
{
uint num = 0;
if (!get_hex(lex, 2, &num)) {
if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
// TODO error message
assert(0);
}
c = num;
break;
}
case 'N': break; // TODO \N{name} only in strings
case 'u': break; // TODO \uxxxx only in strings
case 'U': break; // TODO \Uxxxxxxxx only in strings
case 'N':
// Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
// entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
// 3MB of text; even gzip-compressed and with minimal structure, it'll take
// roughly half a meg of storage. This form of Unicode escape may be added
// later on, but it's definitely not a priority right now. -- CJA 20140607
assert(!"Unicode name escapes not supported");
break;
default:
if (c >= '0' && c <= '7') {
// Octal sequence, 1-3 chars
Expand All @@ -533,7 +546,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
}
}
if (c != MP_LEXER_CHAR_EOF) {
vstr_add_char(&lex->vstr, c);
if (c < 0x110000 && !is_bytes) {
vstr_add_char(&lex->vstr, c);
} else if (c < 0x100 && is_bytes) {
vstr_add_byte(&lex->vstr, c);
} else {
assert(!"TODO: Throw an error, invalid escape code probably");
}
}
} else {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Expand Down
18 changes: 17 additions & 1 deletion py/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ bool unichar_isupper(unichar c);
bool unichar_islower(unichar c);
unichar unichar_tolower(unichar c);
unichar unichar_toupper(unichar c);
#define unichar_charlen(s, bytelen) (bytelen)
// Length of the string str (which is len bytes long) counted in characters
// rather than bytes -- presumably UTF-8 characters; confirm against the
// implementation.
uint unichar_charlen(const char *str, uint len); // TODO this should return machine_uint_t
// Non-zero if the byte has its top bit set, i.e. it is not a 7-bit ASCII character.
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
// True if the byte has the form 10xxxxxx, which UTF-8 uses for continuation bytes.
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)

/** variable string *********************************************/

Expand Down Expand Up @@ -164,4 +166,18 @@ int DEBUG_printf(const char *fmt, ...);

extern uint mp_verbose_flag;

// This is useful for unicode handling. Some CPU archs have
// special instructions for efficient implementation of this
// function (e.g. CLZ on ARM).
// NOTE: this function is unused at the moment
#ifndef count_lead_ones
// Return how many consecutive 1 bits val has, starting from its most
// significant bit (0 for values below 0x80, 8 for 0xff).
static inline uint count_lead_ones(byte val) {
    uint ones = 0;
    byte bit = 0x80;
    // Walk down from the top bit; stop at the first 0 bit, or once the probe
    // bit has been shifted out entirely (all eight bits were set).
    while (val & bit) {
        ones++;
        bit >>= 1;
    }
    return ones;
}
#endif

#endif // _INCLUDED_MINILIB_H
5 changes: 5 additions & 0 deletions py/mpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,11 @@ typedef double mp_float_t;
/*****************************************************************************/
/* Fine control over Python builtins, classes, modules, etc */

// Whether the str object is a proper unicode string.
// Defaults to 0 (disabled) unless the port has already defined it.
#ifndef MICROPY_PY_BUILTINS_STR_UNICODE
#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
#endif

// Whether to support bytearray object
#ifndef MICROPY_PY_BUILTINS_BYTEARRAY
#define MICROPY_PY_BUILTINS_BYTEARRAY (1)
Expand Down
7 changes: 6 additions & 1 deletion py/obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,12 @@ uint mp_get_index(const mp_obj_type_t *type, machine_uint_t len, mp_obj_t index,

// may return MP_OBJ_NULL
mp_obj_t mp_obj_len_maybe(mp_obj_t o_in) {
if (MP_OBJ_IS_STR(o_in) || MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) {
if (
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// It's simple - unicode is slow, non-unicode is fast
MP_OBJ_IS_STR(o_in) ||
#endif
MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) {
return MP_OBJ_NEW_SMALL_INT((machine_int_t)mp_obj_str_get_len(o_in));
} else {
mp_obj_type_t *type = mp_obj_get_type(o_in);
Expand Down
Loading