Skip to content

Commit 24371c7

Browse files
committed
Break out index-to-pointer calculation into a function
1 parent 616c24a commit 24371c7

File tree

1 file changed

+46
-40
lines changed

1 file changed

+46
-40
lines changed

py/objstr.c

Lines changed: 46 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,51 @@ STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
360360
return MP_OBJ_NULL; // op not supported
361361
}
362362

363+
// Convert an index into a pointer to its lead byte, or raise IndexError if out of bounds
364+
STATIC const char *str_index_to_ptr(const char *self_data, uint self_len, mp_obj_t index) {
365+
machine_int_t i;
366+
// Copied from mp_get_index; I don't want bounds checking, just give me
367+
// the integer as-is. (I can't bounds-check without scanning the whole
368+
// string; an out-of-bounds index will be caught in the loops below.)
369+
if (MP_OBJ_IS_SMALL_INT(index)) {
370+
i = MP_OBJ_SMALL_INT_VALUE(index);
371+
} else if (!mp_obj_get_int_maybe(index, &i)) {
372+
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
373+
}
374+
const char *s, *top = self_data + self_len;
375+
if (i < 0)
376+
{
377+
// Negative indexing is performed by counting from the end of the string.
378+
for (s = top - 1; i; --s) {
379+
if (s < self_data) {
380+
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
381+
}
382+
if (!UTF8_IS_CONT(*s)) {
383+
++i;
384+
}
385+
}
386+
++s;
387+
} else {
388+
// Positive indexing, correspondingly, counts from the start of the string.
389+
// It's assumed that negative indexing will generally be used with small
390+
// absolute values (eg str[-1], not str[-1000000]), which means it'll be
391+
// more efficient this way.
392+
for (s = self_data; i; ++s) {
393+
if (s >= top) {
394+
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
395+
}
396+
if (!UTF8_IS_CONT(*s)) {
397+
--i;
398+
}
399+
}
400+
// Skip continuation bytes after the last lead byte
401+
while (UTF8_IS_CONT(*s)) {
402+
++s;
403+
}
404+
}
405+
return s;
406+
}
407+
363408
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
364409
mp_obj_type_t *type = mp_obj_get_type(self_in);
365410
GET_STR_DATA_LEN(self_in, self_data, self_len);
@@ -379,46 +424,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
379424
uint index_val = mp_get_index(type, self_len, index, false);
380425
return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
381426
}
382-
const char *s, *top = (const char *)self_data + self_len;
383-
machine_int_t i;
384-
// Copied from mp_get_index; I don't want bounds checking, just give me
385-
// the integer as-is. (I can't bounds-check without scanning the whole
386-
// string; an out-of-bounds index will be caught in the loops below.)
387-
if (MP_OBJ_IS_SMALL_INT(index)) {
388-
i = MP_OBJ_SMALL_INT_VALUE(index);
389-
} else if (!mp_obj_get_int_maybe(index, &i)) {
390-
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "%s indices must be integers, not %s", qstr_str(type->name), mp_obj_get_type_str(index)));
391-
}
392-
if (i < 0)
393-
{
394-
// Negative indexing is performed by counting from the end of the string.
395-
for (s = top - 1; i; --s) {
396-
if (s < (const char *)self_data) {
397-
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
398-
}
399-
if (!UTF8_IS_CONT(*s)) {
400-
++i;
401-
}
402-
}
403-
++s;
404-
} else {
405-
// Positive indexing, correspondingly, counts from the start of the string.
406-
// It's assumed that negative indexing will generally be used with small
407-
// absolute values (eg str[-1], not str[-1000000]), which means it'll be
408-
// more efficient this way.
409-
for (s = (const char *)self_data; i; ++s) {
410-
if (s >= top) {
411-
nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
412-
}
413-
if (!UTF8_IS_CONT(*s)) {
414-
--i;
415-
}
416-
}
417-
// Skip continuation bytes after the last lead byte
418-
while (UTF8_IS_CONT(*s)) {
419-
++s;
420-
}
421-
}
427+
const char *s = str_index_to_ptr((const char *)self_data, self_len, index);
422428
int len = 1;
423429
if (UTF8_IS_NONASCII(*s)) {
424430
// Count the number of 1 bits (after the first)

0 commit comments

Comments
 (0)