Skip to content

Make str and bytes be proper types #355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions py/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -375,28 +375,6 @@ STATIC mp_obj_t mp_builtin_sorted(uint n_args, const mp_obj_t *args, mp_map_t *k

MP_DEFINE_CONST_FUN_OBJ_KW(mp_builtin_sorted_obj, 1, mp_builtin_sorted);

// Builtin str(o): render o_in with PRINT_STR into a temporary vstr and
// return whatever mp_obj_new_str builds from the rendered bytes.
STATIC mp_obj_t mp_builtin_str(mp_obj_t o_in) {
    vstr_t *scratch = vstr_new();
    // vstr_printf is used as the print callback, with the vstr as its env
    void (*emit)(void*, const char*, ...) = (void (*)(void*, const char*, ...))vstr_printf;
    mp_obj_print_helper(emit, scratch, o_in, PRINT_STR);
    mp_obj_t result = mp_obj_new_str((byte*)scratch->buf, scratch->len, false);
    vstr_free(scratch);
    return result;
}

MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_str_obj, mp_builtin_str);

// TODO: This should be a type; this is just a quick CPython compat hack.
// MicroPython strings are currently a mix of CPython's byte and unicode
// strings, so the "conversion" is a no-op: the first argument is returned
// unchanged. Anything that is neither a qstr nor a str object trips assert(0).
STATIC mp_obj_t mp_builtin_bytes(uint n_args, const mp_obj_t *args) {
    mp_obj_t src = args[0];
    if (MP_OBJ_IS_QSTR(src) || MP_OBJ_IS_TYPE(src, &str_type)) {
        return src;
    }
    assert(0);
    return src;
}

MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin_bytes_obj, 1, 3, mp_builtin_bytes);

// Builtin id(o): the object handle itself, cast to machine_int_t and wrapped
// in an int object.
STATIC mp_obj_t mp_builtin_id(mp_obj_t o_in) {
    machine_int_t handle_value = (machine_int_t)o_in;
    return mp_obj_new_int(handle_value);
}
Expand Down
136 changes: 127 additions & 9 deletions py/objstr.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ typedef struct _mp_obj_str_t {
mp_obj_base_t base;
machine_uint_t hash : 16; // XXX here we assume the hash size is 16 bits (it is at the moment; see qstr.c)
machine_uint_t len : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
byte data[];
const byte *data;
} mp_obj_str_t;

const mp_obj_t mp_const_empty_bytes;

// use this macro to extract the string hash
// It declares a local `uint str_hash` in the enclosing scope and fills it from
// qstr_hash() when str_obj_in is a qstr, otherwise from the `hash` field of the
// mp_obj_str_t. Because it expands to a declaration followed by an if/else, it
// must be used where a declaration is allowed (not as an unbraced if/else body),
// and str_obj_in is evaluated more than once.
#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; }

Expand All @@ -28,6 +30,7 @@ typedef struct _mp_obj_str_t {

STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len);

/******************************************************************************/
/* str */
Expand Down Expand Up @@ -78,6 +81,109 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env,
}
}

// Constructor for the str type. Behaviour is selected by the argument count:
//   0 args   -> the MP_QSTR_ qstr (presumably the empty string)
//   1 arg    -> the argument rendered with PRINT_STR
//   2-3 args -> args[0] must be a bytes object; the new str points at the
//               same data and reuses its hash (TypeError otherwise)
//   more     -> TypeError
// type_in and n_kw are not consulted.
STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return MP_OBJ_NEW_QSTR(MP_QSTR_);
    }

    if (n_args == 1) {
        // format the object into a temporary vstr, then build the str from it
        vstr_t *scratch = vstr_new();
        mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, scratch, args[0], PRINT_STR);
        mp_obj_t text = mp_obj_new_str((byte*)scratch->buf, scratch->len, false);
        vstr_free(scratch);
        return text;
    }

    if (n_args == 2 || n_args == 3) {
        // TODO: validate 2nd/3rd args
        mp_obj_t src = args[0];
        if (!MP_OBJ_IS_TYPE(src, &bytes_type)) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
        }
        GET_STR_DATA_LEN(src, src_data, src_len);
        GET_STR_HASH(src, src_hash);
        // str_new is given NULL data, so the buffer pointer and hash are
        // filled in here from the source bytes object
        mp_obj_str_t *result = str_new(&str_type, NULL, src_len);
        result->data = src_data;
        result->hash = src_hash;
        return result;
    }

    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
}

// Constructor for the bytes type.
//
// Call forms handled:
//   bytes()                         -> mp_const_empty_bytes
//   bytes(str, encoding[, errors])  -> bytes object that points at the string's
//                                      data and reuses its hash (no copy)
//   bytes(small_int)                -> that many zero bytes
//   bytes(iterable)                 -> one byte per item yielded
//
// Raises TypeError for a wrong argument count or a non-integer item, and
// ValueError for a negative count or an item outside 0..255.
// type_in and n_kw are not consulted.
STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
    if (n_args == 0) {
        return mp_const_empty_bytes;
    }

    if (MP_OBJ_IS_STR(args[0])) {
        if (n_args < 2 || n_args > 3) {
            goto wrong_args;
        }
        // TODO: validate 2nd/3rd args; they are currently not examined
        GET_STR_DATA_LEN(args[0], str_data, str_len);
        GET_STR_HASH(args[0], str_hash);
        // str_new is given NULL data, so the buffer pointer and hash are
        // filled in here from the source string
        mp_obj_str_t *o = str_new(&bytes_type, NULL, str_len);
        o->data = str_data;
        o->hash = str_hash;
        return o;
    }

    if (n_args > 1) {
        goto wrong_args;
    }

    if (MP_OBJ_IS_SMALL_INT(args[0])) {
        // Check the sign before converting to unsigned; a negative count
        // would otherwise wrap around to a huge length.
        if (MP_OBJ_SMALL_INT_VALUE(args[0]) < 0) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "negative count"));
        }
        uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
        byte *data;

        mp_obj_t o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memset(data, 0, len);
        return mp_obj_str_builder_end(o);
    }

    int len;
    byte *data;
    vstr_t *vstr = NULL;
    mp_obj_t o = NULL;
    // Try to create array of exact len if initializer len is known
    mp_obj_t len_in = mp_obj_len_maybe(args[0]);
    if (len_in == MP_OBJ_NULL) {
        // length unknown (len == -1 marks this): accumulate into a growable vstr
        len = -1;
        vstr = vstr_new();
    } else {
        len = MP_OBJ_SMALL_INT_VALUE(len_in);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
    }

    mp_obj_t iterable = rt_getiter(args[0]);
    mp_obj_t item;
    while ((item = rt_iternext(iterable)) != mp_const_stop_iteration) {
        // Only small ints can be decoded with MP_OBJ_SMALL_INT_VALUE; reject
        // anything else instead of storing a meaningless byte.
        if (!MP_OBJ_IS_SMALL_INT(item)) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes items must be integers"));
        }
        // Refuse values that do not fit in one byte rather than truncating.
        if (MP_OBJ_SMALL_INT_VALUE(item) < 0 || MP_OBJ_SMALL_INT_VALUE(item) > 255) {
            nlr_jump(mp_obj_new_exception_msg(&mp_type_ValueError, "bytes must be in range(0, 256)"));
        }
        byte value = (byte)MP_OBJ_SMALL_INT_VALUE(item);
        if (len == -1) {
            vstr_add_char(vstr, value);
        } else {
            *data++ = value;
        }
    }

    if (len == -1) {
        vstr_shrink(vstr);
        // TODO: Optimize, borrow buffer from vstr
        len = vstr_len(vstr);
        o = mp_obj_str_builder_start(&bytes_type, len, &data);
        memcpy(data, vstr_str(vstr), len);
        vstr_free(vstr);
    }

    return mp_obj_str_builder_end(o);

wrong_args:
    nlr_jump(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
}

// like strstr but with specified length and allows \0 bytes
// TODO replace with something more efficient/standard
STATIC const byte *find_subbytes(const byte *haystack, uint hlen, const byte *needle, uint nlen) {
Expand Down Expand Up @@ -619,6 +725,7 @@ const mp_obj_type_t str_type = {
{ &mp_type_type },
.name = MP_QSTR_str,
.print = str_print,
.make_new = str_make_new,
.binary_op = str_binary_op,
.getiter = mp_obj_new_str_iterator,
.methods = str_type_methods,
Expand All @@ -630,34 +737,45 @@ const mp_obj_type_t bytes_type = {
{ &mp_type_type },
.name = MP_QSTR_bytes,
.print = str_print,
.make_new = bytes_make_new,
.binary_op = str_binary_op,
.getiter = mp_obj_new_bytes_iterator,
.methods = str_type_methods,
};

// the zero-length bytes
STATIC const mp_obj_str_t empty_bytes_obj = {{&bytes_type}, 0, 0, NULL};
const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;

// Begin building a str/bytes object of `len` bytes: sets the object's type and
// length and hands the caller a writable buffer through *data. The hash is not
// set here; see mp_obj_str_builder_end.
// NOTE(review): this listing appears to be a rendered diff with the +/- markers
// stripped -- `o` is declared twice and both the inline-data variant
// (m_new_obj_var, `*data = o->data`) and the separate-buffer variant (m_new_obj
// plus m_new) are present. Presumably the inline-data lines are the removed
// ones; confirm against the repository before compiling.
mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
o->base.type = type;
o->len = len;
*data = o->data;
// len + 1 leaves room for the terminating null byte written by the builder end
byte *p = m_new(byte, len + 1);
o->data = p;
*data = p;
return o;
}

// Finish an object started with mp_obj_str_builder_start: compute and store the
// hash over the first `len` bytes, write the terminating null byte at index
// `len`, and return the same object.
// NOTE(review): two forms of the terminator write are listed (directly through
// o->data, and through the non-const alias `p`); this looks like the before and
// after lines of a diff shown together -- TODO confirm which one is current.
mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
assert(MP_OBJ_IS_STR(o_in));
mp_obj_str_t *o = o_in;
o->hash = qstr_compute_hash(o->data, o->len);
o->data[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
// cast drops the const qualifier of the data member so the terminator can be written
byte *p = (byte*)o->data;
p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
return o;
}

// Create a new object of the given type with length `len`.
// In the `if (data)` variant below, a non-NULL `data` is hashed and copied into
// a freshly allocated, null-terminated buffer of len + 1 bytes; when `data` is
// NULL only the type and length are set, and the caller is expected to assign
// the data pointer and hash itself (as str_make_new and bytes_make_new do).
// NOTE(review): this listing appears to interleave the removed and added lines
// of a diff (duplicate declaration of `o`, unconditional hash/memcpy next to the
// guarded versions) -- confirm against the repository before compiling.
STATIC mp_obj_t str_new(const mp_obj_type_t *type, const byte* data, uint len) {
mp_obj_str_t *o = m_new_obj_var(mp_obj_str_t, byte, len + 1);
mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
o->base.type = type;
o->hash = qstr_compute_hash(data, len);
o->len = len;
memcpy(o->data, data, len * sizeof(byte));
o->data[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
if (data) {
o->hash = qstr_compute_hash(data, len);
byte *p = m_new(byte, len + 1);
o->data = p;
memcpy(p, data, len * sizeof(byte));
p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
}
return o;
}

Expand Down
4 changes: 2 additions & 2 deletions py/runtime.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ STATIC const mp_builtin_elem_t builtin_table[] = {

// built-in types
{ MP_QSTR_bool, (mp_obj_t)&bool_type },
{ MP_QSTR_bytes, (mp_obj_t)&bytes_type },
#if MICROPY_ENABLE_FLOAT
{ MP_QSTR_complex, (mp_obj_t)&mp_type_complex },
#endif
Expand All @@ -102,6 +103,7 @@ STATIC const mp_builtin_elem_t builtin_table[] = {
{ MP_QSTR_list, (mp_obj_t)&list_type },
{ MP_QSTR_map, (mp_obj_t)&map_type },
{ MP_QSTR_set, (mp_obj_t)&set_type },
{ MP_QSTR_str, (mp_obj_t)&str_type },
{ MP_QSTR_super, (mp_obj_t)&super_type },
{ MP_QSTR_tuple, (mp_obj_t)&tuple_type },
{ MP_QSTR_type, (mp_obj_t)&mp_type_type },
Expand All @@ -114,7 +116,6 @@ STATIC const mp_builtin_elem_t builtin_table[] = {
{ MP_QSTR_abs, (mp_obj_t)&mp_builtin_abs_obj },
{ MP_QSTR_all, (mp_obj_t)&mp_builtin_all_obj },
{ MP_QSTR_any, (mp_obj_t)&mp_builtin_any_obj },
{ MP_QSTR_bytes, (mp_obj_t)&mp_builtin_bytes_obj },
{ MP_QSTR_callable, (mp_obj_t)&mp_builtin_callable_obj },
{ MP_QSTR_chr, (mp_obj_t)&mp_builtin_chr_obj },
{ MP_QSTR_dir, (mp_obj_t)&mp_builtin_dir_obj },
Expand All @@ -137,7 +138,6 @@ STATIC const mp_builtin_elem_t builtin_table[] = {
{ MP_QSTR_repr, (mp_obj_t)&mp_builtin_repr_obj },
{ MP_QSTR_sorted, (mp_obj_t)&mp_builtin_sorted_obj },
{ MP_QSTR_sum, (mp_obj_t)&mp_builtin_sum_obj },
{ MP_QSTR_str, (mp_obj_t)&mp_builtin_str_obj },
{ MP_QSTR_bytearray, (mp_obj_t)&mp_builtin_bytearray_obj },

// built-in exceptions
Expand Down
28 changes: 28 additions & 0 deletions tests/basics/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,36 @@
# NOTE(review): indentation appears to have been stripped from this listing;
# the bodies of the try/except/for/def headers are presumably the line(s) that
# follow each header -- restore the indentation before running.
# `a` is defined earlier in the file, outside this excerpt.
print(repr(a))
print(a[0], a[2])
print(a[-1])
# str() called on `a` with an encoding, and with encoding plus errors argument
print(str(a, "utf-8"))
print(str(a, "utf-8", "ignore"))
# a fourth argument is expected to raise TypeError
try:
str(a, "utf-8", "ignore", "toomuch")
except TypeError:
print("TypeError")

# iterate over `a` and sum the items
s = 0
for i in a:
s += i
print(s)


# bytes() from a str with an encoding, and with encoding plus errors argument
print(bytes("abc", "utf-8"))
print(bytes("abc", "utf-8", "replace"))
# a str without an encoding is expected to raise TypeError
try:
bytes("abc")
except TypeError:
print("TypeError")
# too many arguments is expected to raise TypeError
try:
bytes("abc", "utf-8", "replace", "toomuch")
except TypeError:
print("TypeError")

# bytes() from an integer
print(bytes(3))

# bytes() from a list and from a range
print(bytes([3, 2, 1]))
print(bytes(range(5)))

# bytes() from a generator
def gen():
for i in range(4):
yield i
print(bytes(gen()))