@@ -360,6 +360,51 @@ STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
360
360
return MP_OBJ_NULL ; // op not supported
361
361
}
362
362
363
+ // Convert an index into a pointer to its lead byte, or raise IndexError if out of bounds
364
+ STATIC const char * str_index_to_ptr (const char * self_data , uint self_len , mp_obj_t index ) {
365
+ machine_int_t i ;
366
+ // Copied from mp_get_index; I don't want bounds checking, just give me
367
+ // the integer as-is. (I can't bounds-check without scanning the whole
368
+ // string; an out-of-bounds index will be caught in the loops below.)
369
+ if (MP_OBJ_IS_SMALL_INT (index )) {
370
+ i = MP_OBJ_SMALL_INT_VALUE (index );
371
+ } else if (!mp_obj_get_int_maybe (index , & i )) {
372
+ nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_TypeError , "string indices must be integers, not %s" , mp_obj_get_type_str (index )));
373
+ }
374
+ const char * s , * top = self_data + self_len ;
375
+ if (i < 0 )
376
+ {
377
+ // Negative indexing is performed by counting from the end of the string.
378
+ for (s = top - 1 ; i ; -- s ) {
379
+ if (s < self_data ) {
380
+ nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
381
+ }
382
+ if (!UTF8_IS_CONT (* s )) {
383
+ ++ i ;
384
+ }
385
+ }
386
+ ++ s ;
387
+ } else {
388
+ // Positive indexing, correspondingly, counts from the start of the string.
389
+ // It's assumed that negative indexing will generally be used with small
390
+ // absolute values (eg str[-1], not str[-1000000]), which means it'll be
391
+ // more efficient this way.
392
+ for (s = self_data ; i ; ++ s ) {
393
+ if (s >= top ) {
394
+ nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
395
+ }
396
+ if (!UTF8_IS_CONT (* s )) {
397
+ -- i ;
398
+ }
399
+ }
400
+ // Skip continuation bytes after the last lead byte
401
+ while (UTF8_IS_CONT (* s )) {
402
+ ++ s ;
403
+ }
404
+ }
405
+ return s ;
406
+ }
407
+
363
408
STATIC mp_obj_t str_subscr (mp_obj_t self_in , mp_obj_t index , mp_obj_t value ) {
364
409
mp_obj_type_t * type = mp_obj_get_type (self_in );
365
410
GET_STR_DATA_LEN (self_in , self_data , self_len );
@@ -379,46 +424,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
379
424
uint index_val = mp_get_index (type , self_len , index , false);
380
425
return MP_OBJ_NEW_SMALL_INT ((mp_small_int_t )self_data [index_val ]);
381
426
}
382
- const char * s , * top = (const char * )self_data + self_len ;
383
- machine_int_t i ;
384
- // Copied from mp_get_index; I don't want bounds checking, just give me
385
- // the integer as-is. (I can't bounds-check without scanning the whole
386
- // string; an out-of-bounds index will be caught in the loops below.)
387
- if (MP_OBJ_IS_SMALL_INT (index )) {
388
- i = MP_OBJ_SMALL_INT_VALUE (index );
389
- } else if (!mp_obj_get_int_maybe (index , & i )) {
390
- nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_TypeError , "%s indices must be integers, not %s" , qstr_str (type -> name ), mp_obj_get_type_str (index )));
391
- }
392
- if (i < 0 )
393
- {
394
- // Negative indexing is performed by counting from the end of the string.
395
- for (s = top - 1 ; i ; -- s ) {
396
- if (s < (const char * )self_data ) {
397
- nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
398
- }
399
- if (!UTF8_IS_CONT (* s )) {
400
- ++ i ;
401
- }
402
- }
403
- ++ s ;
404
- } else {
405
- // Positive indexing, correspondingly, counts from the start of the string.
406
- // It's assumed that negative indexing will generally be used with small
407
- // absolute values (eg str[-1], not str[-1000000]), which means it'll be
408
- // more efficient this way.
409
- for (s = (const char * )self_data ; i ; ++ s ) {
410
- if (s >= top ) {
411
- nlr_raise (mp_obj_new_exception_msg_varg (& mp_type_IndexError , "string index out of range" ));
412
- }
413
- if (!UTF8_IS_CONT (* s )) {
414
- -- i ;
415
- }
416
- }
417
- // Skip continuation bytes after the last lead byte
418
- while (UTF8_IS_CONT (* s )) {
419
- ++ s ;
420
- }
421
- }
427
+ const char * s = str_index_to_ptr ((const char * )self_data , self_len , index );
422
428
int len = 1 ;
423
429
if (UTF8_IS_NONASCII (* s )) {
424
430
// Count the number of 1 bits (after the first)
0 commit comments