Skip to content

Commit f88fc7b

Browse files
committed
Merge branch 'pfalcon-keep-strings-uninterned'
2 parents 5fd5af9 + 5042bce commit f88fc7b

File tree

4 files changed

+122 -45 lines changed

py/compile.c

Lines changed: 61 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ typedef enum {
5656
#include "grammar.h"
5757
#undef DEF_RULE
5858
PN_maximum_number_of,
59+
PN_string, // special node for non-interned string
5960
} pn_kind_t;
6061

6162
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
@@ -177,6 +178,8 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
177178
}
178179
break;
179180
#endif
181+
case PN_string:
182+
return pn;
180183
}
181184

182185
// fold arguments
@@ -426,6 +429,9 @@ void compile_generic_all_nodes(compiler_t *comp, mp_parse_node_struct_t *pns) {
426429

427430
#if MICROPY_EMIT_CPYTHON
428431
STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
432+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
433+
return true;
434+
}
429435
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
430436
return false;
431437
}
@@ -435,9 +441,7 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
435441
return true;
436442
}
437443

438-
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
439-
uint len;
440-
const byte *str = qstr_data(qstr, &len);
444+
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len, bool bytes) {
441445
bool has_single_quote = false;
442446
bool has_double_quote = false;
443447
for (int i = 0; i < len; i++) {
@@ -476,6 +480,12 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
476480
}
477481

478482
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
483+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
484+
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
485+
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1], false);
486+
return;
487+
}
488+
479489
assert(MP_PARSE_NODE_IS_LEAF(pn));
480490
if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
481491
vstr_printf(vstr, INT_FMT, MP_PARSE_NODE_LEAF_SMALL_INT(pn));
@@ -487,8 +497,13 @@ STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vst
487497
case MP_PARSE_NODE_ID: assert(0);
488498
case MP_PARSE_NODE_INTEGER: vstr_printf(vstr, "%s", qstr_str(arg)); break;
489499
case MP_PARSE_NODE_DECIMAL: vstr_printf(vstr, "%s", qstr_str(arg)); break;
490-
case MP_PARSE_NODE_STRING: cpython_c_print_quoted_str(vstr, arg, false); break;
491-
case MP_PARSE_NODE_BYTES: cpython_c_print_quoted_str(vstr, arg, true); break;
500+
case MP_PARSE_NODE_STRING:
501+
case MP_PARSE_NODE_BYTES: {
502+
uint len;
503+
const byte *str = qstr_data(arg, &len);
504+
cpython_c_print_quoted_str(vstr, (const char*)str, len, MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_BYTES);
505+
break;
506+
}
492507
case MP_PARSE_NODE_TOKEN:
493508
switch (arg) {
494509
case MP_TOKEN_KW_FALSE: vstr_printf(vstr, "False"); break;
@@ -2058,7 +2073,8 @@ void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
20582073

20592074
} else {
20602075
// for non-REPL, evaluate then discard the expression
2061-
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) {
2076+
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
2077+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
20622078
// do nothing with a lonely constant
20632079
} else {
20642080
compile_node(comp, pns->nodes[0]); // just an expression
@@ -2498,26 +2514,40 @@ void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
24982514
int n_bytes = 0;
24992515
int string_kind = MP_PARSE_NODE_NULL;
25002516
for (int i = 0; i < n; i++) {
2501-
assert(MP_PARSE_NODE_IS_LEAF(pns->nodes[i]));
2502-
int pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
2503-
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
2517+
int pn_kind;
2518+
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
2519+
pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
2520+
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
2521+
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
2522+
} else {
2523+
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
2524+
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
2525+
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
2526+
pn_kind = MP_PARSE_NODE_STRING;
2527+
n_bytes += (machine_uint_t)pns_string->nodes[1];
2528+
}
25042529
if (i == 0) {
25052530
string_kind = pn_kind;
25062531
} else if (pn_kind != string_kind) {
25072532
compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
25082533
return;
25092534
}
2510-
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
25112535
}
25122536

25132537
// concatenate string/bytes
25142538
byte *q_ptr;
25152539
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
25162540
for (int i = 0; i < n; i++) {
2517-
uint s_len;
2518-
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
2519-
memcpy(s_dest, s, s_len);
2520-
s_dest += s_len;
2541+
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
2542+
uint s_len;
2543+
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
2544+
memcpy(s_dest, s, s_len);
2545+
s_dest += s_len;
2546+
} else {
2547+
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
2548+
memcpy(s_dest, (const char*)pns_string->nodes[0], (machine_uint_t)pns_string->nodes[1]);
2549+
s_dest += (machine_uint_t)pns_string->nodes[1];
2550+
}
25212551
}
25222552
qstr q = qstr_build_end(q_ptr);
25232553

@@ -2848,15 +2878,19 @@ void compile_node(compiler_t *comp, mp_parse_node_t pn) {
28482878
} else {
28492879
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
28502880
EMIT_ARG(set_line_number, pns->source_line);
2851-
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
2852-
if (f == NULL) {
2853-
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
2881+
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
2882+
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1]), false);
2883+
} else {
2884+
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
2885+
if (f == NULL) {
2886+
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
28542887
#if MICROPY_DEBUG_PRINTERS
2855-
mp_parse_node_print(pn, 0);
2888+
mp_parse_node_print(pn, 0);
28562889
#endif
2857-
compile_syntax_error(comp, pn, "internal compiler error");
2858-
} else {
2859-
f(comp, pns);
2890+
compile_syntax_error(comp, pn, "internal compiler error");
2891+
} else {
2892+
f(comp, pns);
2893+
}
28602894
}
28612895
}
28622896
}
@@ -3033,13 +3067,13 @@ STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) {
30333067
// check the first statement for a doc string
30343068
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_expr_stmt)) {
30353069
mp_parse_node_struct_t* pns = (mp_parse_node_struct_t*)pn;
3036-
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0])) {
3037-
int kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]);
3038-
if (kind == MP_PARSE_NODE_STRING) {
3039-
compile_node(comp, pns->nodes[0]); // a doc string
3040-
// store doc string
3070+
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0])
3071+
&& MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]) == MP_PARSE_NODE_STRING)
3072+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
3073+
// compile the doc string
3074+
compile_node(comp, pns->nodes[0]);
3075+
// store the doc string
30413076
EMIT_ARG(store_id, MP_QSTR___doc__);
3042-
}
30433077
}
30443078
}
30453079
#endif

py/mpconfig.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@
6666
#define MICROPY_ALLOC_PARSE_RESULT_INC (16)
6767
#endif
6868

69+
// Strings this length or less will be interned by the parser
70+
#ifndef MICROPY_ALLOC_PARSE_INTERN_STRING_LEN
71+
#define MICROPY_ALLOC_PARSE_INTERN_STRING_LEN (10)
72+
#endif
73+
6974
// Initial amount for ids in a scope
7075
#ifndef MICROPY_ALLOC_SCOPE_ID_INIT
7176
#define MICROPY_ALLOC_SCOPE_ID_INIT (4)

py/parse.c

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <stdint.h>
2929
#include <stdio.h>
3030
#include <assert.h>
31+
#include <string.h>
3132

3233
#include "misc.h"
3334
#include "mpconfig.h"
@@ -70,6 +71,7 @@ enum {
7071
#include "grammar.h"
7172
#undef DEF_RULE
7273
RULE_maximum_number_of,
74+
RULE_string, // special node for non-interned string
7375
};
7476

7577
#define or(n) (RULE_ACT_OR | n)
@@ -170,26 +172,26 @@ mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg) {
170172
return (mp_parse_node_t)(kind | (arg << 5));
171173
}
172174

173-
uint mp_parse_node_free(mp_parse_node_t pn) {
174-
uint cnt = 0;
175+
void mp_parse_node_free(mp_parse_node_t pn) {
175176
if (MP_PARSE_NODE_IS_STRUCT(pn)) {
176177
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
177178
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
178179
uint rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
180+
if (rule_id == RULE_string) {
181+
return;
182+
}
179183
bool adjust = ADD_BLANK_NODE(rule_id);
180184
if (adjust) {
181185
n--;
182186
}
183187
for (uint i = 0; i < n; i++) {
184-
cnt += mp_parse_node_free(pns->nodes[i]);
188+
mp_parse_node_free(pns->nodes[i]);
185189
}
186190
if (adjust) {
187191
n++;
188192
}
189193
m_del_var(mp_parse_node_struct_t, mp_parse_node_t, n, pns);
190-
cnt++;
191194
}
192-
return cnt;
193195
}
194196

195197
#if MICROPY_DEBUG_PRINTERS
@@ -219,15 +221,20 @@ void mp_parse_node_print(mp_parse_node_t pn, int indent) {
219221
default: assert(0);
220222
}
221223
} else {
224+
// node must be a mp_parse_node_struct_t
222225
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
223-
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
226+
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
227+
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
228+
} else {
229+
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
224230
#ifdef USE_RULE_NAME
225-
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
231+
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
226232
#else
227-
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
233+
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
228234
#endif
229-
for (uint i = 0; i < n; i++) {
230-
mp_parse_node_print(pns->nodes[i], indent + 2);
235+
for (uint i = 0; i < n; i++) {
236+
mp_parse_node_print(pns->nodes[i], indent + 2);
237+
}
231238
}
232239
}
233240
}
@@ -274,6 +281,21 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
274281
parser->result_stack[parser->result_stack_top++] = pn;
275282
}
276283

284+
STATIC void push_result_string(parser_t *parser, int src_line, const char *str, uint len) {
285+
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
286+
if (pn == NULL) {
287+
memory_error(parser);
288+
return;
289+
}
290+
pn->source_line = src_line;
291+
pn->kind_num_nodes = RULE_string | (2 << 8);
292+
char *p = m_new(char, len);
293+
memcpy(p, str, len);
294+
pn->nodes[0] = (machine_int_t)p;
295+
pn->nodes[1] = len;
296+
push_result_node(parser, (mp_parse_node_t)pn);
297+
}
298+
277299
STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
278300
const mp_token_t *tok = mp_lexer_cur(lex);
279301
mp_parse_node_t pn;
@@ -319,7 +341,24 @@ STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
319341
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
320342
}
321343
} else if (tok->kind == MP_TOKEN_STRING) {
322-
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qstr_from_strn(tok->str, tok->len));
344+
// Don't automatically intern all strings. doc strings (which are usually large)
345+
// will be discarded by the compiler, and so we shouldn't intern them.
346+
qstr qst = MP_QSTR_NULL;
347+
if (tok->len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
348+
// intern short strings
349+
qst = qstr_from_strn(tok->str, tok->len);
350+
} else {
351+
// check if this string is already interned
352+
qst = qstr_find_strn((const byte*)tok->str, tok->len);
353+
}
354+
if (qst != MP_QSTR_NULL) {
355+
// qstr exists, make a leaf node
356+
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
357+
} else {
358+
// not interned, make a node holding a pointer to the string data
359+
push_result_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
360+
return;
361+
}
323362
} else if (tok->kind == MP_TOKEN_BYTES) {
324363
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(tok->str, tok->len));
325364
} else {
@@ -516,14 +555,13 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
516555
}
517556
}
518557

519-
#if 0 && !MICROPY_ENABLE_DOC_STRING
520-
// this code discards lonely statement, such as doc strings
521-
// problem is that doc strings have already been interned, so this doesn't really help reduce RAM usage
558+
#if !MICROPY_EMIT_CPYTHON && !MICROPY_ENABLE_DOC_STRING
559+
// this code discards lonely statements, such as doc strings
522560
if (input_kind != MP_PARSE_SINGLE_INPUT && rule->rule_id == RULE_expr_stmt && peek_result(&parser, 0) == MP_PARSE_NODE_NULL) {
523561
mp_parse_node_t p = peek_result(&parser, 1);
524-
if (MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) {
525-
pop_result(parser);
526-
pop_result(parser);
562+
if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRUCT_KIND(p, RULE_string)) {
563+
pop_result(&parser);
564+
pop_result(&parser);
527565
push_result_rule(&parser, rule_src_line, rules[RULE_pass_stmt], 0);
528566
break;
529567
}

py/parse.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ typedef struct _mp_parse_node_struct_t {
8282
#define MP_PARSE_NODE_STRUCT_NUM_NODES(pns) ((pns)->kind_num_nodes >> 8)
8383

8484
mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg);
85-
uint mp_parse_node_free(mp_parse_node_t pn);
85+
void mp_parse_node_free(mp_parse_node_t pn);
8686

8787
void mp_parse_node_print(mp_parse_node_t pn, int indent);
8888

0 commit comments

Comments
 (0)