Skip to content

Commit 5042bce

Browse files
committed
py: Don't automatically intern strings in parser.
This completes non-automatic interning of strings in the parser, so that doc strings don't take up RAM. It complicates the parser and compiler, and bloats stmhal by about 300 bytes. It's complicated because now there are 2 kinds of parse-nodes that can be strings: interned leaves and non-interned structs.
1 parent 3aaabd1 commit 5042bce

File tree

4 files changed

+121
-70
lines changed

4 files changed

+121
-70
lines changed

py/compile.c

Lines changed: 61 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ typedef enum {
5656
#include "grammar.h"
5757
#undef DEF_RULE
5858
PN_maximum_number_of,
59+
PN_string, // special node for non-interned string
5960
} pn_kind_t;
6061

6162
#define EMIT(fun) (comp->emit_method_table->fun(comp->emit))
@@ -177,6 +178,8 @@ STATIC mp_parse_node_t fold_constants(compiler_t *comp, mp_parse_node_t pn, mp_m
177178
}
178179
break;
179180
#endif
181+
case PN_string:
182+
return pn;
180183
}
181184

182185
// fold arguments
@@ -426,6 +429,9 @@ void compile_generic_all_nodes(compiler_t *comp, mp_parse_node_struct_t *pns) {
426429

427430
#if MICROPY_EMIT_CPYTHON
428431
STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
432+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
433+
return true;
434+
}
429435
if (!MP_PARSE_NODE_IS_LEAF(pn)) {
430436
return false;
431437
}
@@ -435,9 +441,7 @@ STATIC bool cpython_c_tuple_is_const(mp_parse_node_t pn) {
435441
return true;
436442
}
437443

438-
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
439-
uint len;
440-
const byte *str = qstr_data(qstr, &len);
444+
STATIC void cpython_c_print_quoted_str(vstr_t *vstr, const char *str, uint len, bool bytes) {
441445
bool has_single_quote = false;
442446
bool has_double_quote = false;
443447
for (int i = 0; i < len; i++) {
@@ -476,6 +480,12 @@ STATIC void cpython_c_print_quoted_str(vstr_t *vstr, qstr qstr, bool bytes) {
476480
}
477481

478482
STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vstr_t *vstr) {
483+
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_string)) {
484+
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
485+
cpython_c_print_quoted_str(vstr, (const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1], false);
486+
return;
487+
}
488+
479489
assert(MP_PARSE_NODE_IS_LEAF(pn));
480490
if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
481491
vstr_printf(vstr, INT_FMT, MP_PARSE_NODE_LEAF_SMALL_INT(pn));
@@ -487,8 +497,13 @@ STATIC void cpython_c_tuple_emit_const(compiler_t *comp, mp_parse_node_t pn, vst
487497
case MP_PARSE_NODE_ID: assert(0);
488498
case MP_PARSE_NODE_INTEGER: vstr_printf(vstr, "%s", qstr_str(arg)); break;
489499
case MP_PARSE_NODE_DECIMAL: vstr_printf(vstr, "%s", qstr_str(arg)); break;
490-
case MP_PARSE_NODE_STRING: cpython_c_print_quoted_str(vstr, arg, false); break;
491-
case MP_PARSE_NODE_BYTES: cpython_c_print_quoted_str(vstr, arg, true); break;
500+
case MP_PARSE_NODE_STRING:
501+
case MP_PARSE_NODE_BYTES: {
502+
uint len;
503+
const byte *str = qstr_data(arg, &len);
504+
cpython_c_print_quoted_str(vstr, (const char*)str, len, MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_BYTES);
505+
break;
506+
}
492507
case MP_PARSE_NODE_TOKEN:
493508
switch (arg) {
494509
case MP_TOKEN_KW_FALSE: vstr_printf(vstr, "False"); break;
@@ -2058,7 +2073,8 @@ void compile_expr_stmt(compiler_t *comp, mp_parse_node_struct_t *pns) {
20582073

20592074
} else {
20602075
// for non-REPL, evaluate then discard the expression
2061-
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0])) {
2076+
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0]) && !MP_PARSE_NODE_IS_ID(pns->nodes[0]))
2077+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
20622078
// do nothing with a lonely constant
20632079
} else {
20642080
compile_node(comp, pns->nodes[0]); // just an expression
@@ -2498,26 +2514,40 @@ void compile_atom_string(compiler_t *comp, mp_parse_node_struct_t *pns) {
24982514
int n_bytes = 0;
24992515
int string_kind = MP_PARSE_NODE_NULL;
25002516
for (int i = 0; i < n; i++) {
2501-
assert(MP_PARSE_NODE_IS_LEAF(pns->nodes[i]));
2502-
int pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
2503-
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
2517+
int pn_kind;
2518+
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
2519+
pn_kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[i]);
2520+
assert(pn_kind == MP_PARSE_NODE_STRING || pn_kind == MP_PARSE_NODE_BYTES);
2521+
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
2522+
} else {
2523+
assert(MP_PARSE_NODE_IS_STRUCT(pns->nodes[i]));
2524+
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
2525+
assert(MP_PARSE_NODE_STRUCT_KIND(pns_string) == PN_string);
2526+
pn_kind = MP_PARSE_NODE_STRING;
2527+
n_bytes += (machine_uint_t)pns_string->nodes[1];
2528+
}
25042529
if (i == 0) {
25052530
string_kind = pn_kind;
25062531
} else if (pn_kind != string_kind) {
25072532
compile_syntax_error(comp, (mp_parse_node_t)pns, "cannot mix bytes and nonbytes literals");
25082533
return;
25092534
}
2510-
n_bytes += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
25112535
}
25122536

25132537
// concatenate string/bytes
25142538
byte *q_ptr;
25152539
byte *s_dest = qstr_build_start(n_bytes, &q_ptr);
25162540
for (int i = 0; i < n; i++) {
2517-
uint s_len;
2518-
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
2519-
memcpy(s_dest, s, s_len);
2520-
s_dest += s_len;
2541+
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[i])) {
2542+
uint s_len;
2543+
const byte *s = qstr_data(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), &s_len);
2544+
memcpy(s_dest, s, s_len);
2545+
s_dest += s_len;
2546+
} else {
2547+
mp_parse_node_struct_t *pns_string = (mp_parse_node_struct_t*)pns->nodes[i];
2548+
memcpy(s_dest, (const char*)pns_string->nodes[0], (machine_uint_t)pns_string->nodes[1]);
2549+
s_dest += (machine_uint_t)pns_string->nodes[1];
2550+
}
25212551
}
25222552
qstr q = qstr_build_end(q_ptr);
25232553

@@ -2848,15 +2878,19 @@ void compile_node(compiler_t *comp, mp_parse_node_t pn) {
28482878
} else {
28492879
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
28502880
EMIT_ARG(set_line_number, pns->source_line);
2851-
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
2852-
if (f == NULL) {
2853-
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
2881+
if (MP_PARSE_NODE_STRUCT_KIND(pns) == PN_string) {
2882+
EMIT_ARG(load_const_str, qstr_from_strn((const char*)pns->nodes[0], (machine_uint_t)pns->nodes[1]), false);
2883+
} else {
2884+
compile_function_t f = compile_function[MP_PARSE_NODE_STRUCT_KIND(pns)];
2885+
if (f == NULL) {
2886+
printf("node %u cannot be compiled\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns));
28542887
#if MICROPY_DEBUG_PRINTERS
2855-
mp_parse_node_print(pn, 0);
2888+
mp_parse_node_print(pn, 0);
28562889
#endif
2857-
compile_syntax_error(comp, pn, "internal compiler error");
2858-
} else {
2859-
f(comp, pns);
2890+
compile_syntax_error(comp, pn, "internal compiler error");
2891+
} else {
2892+
f(comp, pns);
2893+
}
28602894
}
28612895
}
28622896
}
@@ -3033,13 +3067,13 @@ STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) {
30333067
// check the first statement for a doc string
30343068
if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_expr_stmt)) {
30353069
mp_parse_node_struct_t* pns = (mp_parse_node_struct_t*)pn;
3036-
if (MP_PARSE_NODE_IS_LEAF(pns->nodes[0])) {
3037-
int kind = MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]);
3038-
if (kind == MP_PARSE_NODE_STRING) {
3039-
compile_node(comp, pns->nodes[0]); // a doc string
3040-
// store doc string
3070+
if ((MP_PARSE_NODE_IS_LEAF(pns->nodes[0])
3071+
&& MP_PARSE_NODE_LEAF_KIND(pns->nodes[0]) == MP_PARSE_NODE_STRING)
3072+
|| MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_string)) {
3073+
// compile the doc string
3074+
compile_node(comp, pns->nodes[0]);
3075+
// store the doc string
30413076
EMIT_ARG(store_id, MP_QSTR___doc__);
3042-
}
30433077
}
30443078
}
30453079
#endif

py/mpconfig.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@
6666
#define MICROPY_ALLOC_PARSE_RESULT_INC (16)
6767
#endif
6868

69+
// Strings this length or less will be interned by the parser
70+
#ifndef MICROPY_ALLOC_PARSE_INTERN_STRING_LEN
71+
#define MICROPY_ALLOC_PARSE_INTERN_STRING_LEN (10)
72+
#endif
73+
6974
// Initial amount for ids in a scope
7075
#ifndef MICROPY_ALLOC_SCOPE_ID_INIT
7176
#define MICROPY_ALLOC_SCOPE_ID_INIT (4)

py/parse.c

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#include <stdint.h>
2929
#include <stdio.h>
3030
#include <assert.h>
31-
#include <memory.h>
31+
#include <string.h>
3232

3333
#include "misc.h"
3434
#include "mpconfig.h"
@@ -71,7 +71,7 @@ enum {
7171
#include "grammar.h"
7272
#undef DEF_RULE
7373
RULE_maximum_number_of,
74-
RULE_string,
74+
RULE_string, // special node for non-interned string
7575
};
7676

7777
#define or(n) (RULE_ACT_OR | n)
@@ -172,26 +172,26 @@ mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg) {
172172
return (mp_parse_node_t)(kind | (arg << 5));
173173
}
174174

175-
uint mp_parse_node_free(mp_parse_node_t pn) {
176-
uint cnt = 0;
175+
void mp_parse_node_free(mp_parse_node_t pn) {
177176
if (MP_PARSE_NODE_IS_STRUCT(pn)) {
178177
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
179178
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
180179
uint rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
180+
if (rule_id == RULE_string) {
181+
return;
182+
}
181183
bool adjust = ADD_BLANK_NODE(rule_id);
182184
if (adjust) {
183185
n--;
184186
}
185187
for (uint i = 0; i < n; i++) {
186-
cnt += mp_parse_node_free(pns->nodes[i]);
188+
mp_parse_node_free(pns->nodes[i]);
187189
}
188190
if (adjust) {
189191
n++;
190192
}
191193
m_del_var(mp_parse_node_struct_t, mp_parse_node_t, n, pns);
192-
cnt++;
193194
}
194-
return cnt;
195195
}
196196

197197
#if MICROPY_DEBUG_PRINTERS
@@ -220,19 +220,21 @@ void mp_parse_node_print(mp_parse_node_t pn, int indent) {
220220
case MP_PARSE_NODE_TOKEN: printf("tok(" INT_FMT ")\n", arg); break;
221221
default: assert(0);
222222
}
223-
} else if (MP_PARSE_NODE_IS_STRING(pn)) {
224-
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
225-
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
226223
} else {
224+
// node must be a mp_parse_node_struct_t
227225
mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
228-
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
226+
if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_string) {
227+
printf("literal str(%.*s)\n", (int)pns->nodes[1], (char*)pns->nodes[0]);
228+
} else {
229+
uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
229230
#ifdef USE_RULE_NAME
230-
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
231+
printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
231232
#else
232-
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
233+
printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
233234
#endif
234-
for (uint i = 0; i < n; i++) {
235-
mp_parse_node_print(pns->nodes[i], indent + 2);
235+
for (uint i = 0; i < n; i++) {
236+
mp_parse_node_print(pns->nodes[i], indent + 2);
237+
}
236238
}
237239
}
238240
}
@@ -279,7 +281,20 @@ STATIC void push_result_node(parser_t *parser, mp_parse_node_t pn) {
279281
parser->result_stack[parser->result_stack_top++] = pn;
280282
}
281283

282-
STATIC void push_string(parser_t *parser, int src_line, const char *str, uint len);
284+
STATIC void push_result_string(parser_t *parser, int src_line, const char *str, uint len) {
285+
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
286+
if (pn == NULL) {
287+
memory_error(parser);
288+
return;
289+
}
290+
pn->source_line = src_line;
291+
pn->kind_num_nodes = RULE_string | (2 << 8);
292+
char *p = m_new(char, len);
293+
memcpy(p, str, len);
294+
pn->nodes[0] = (machine_int_t)p;
295+
pn->nodes[1] = len;
296+
push_result_node(parser, (mp_parse_node_t)pn);
297+
}
283298

284299
STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
285300
const mp_token_t *tok = mp_lexer_cur(lex);
@@ -326,10 +341,24 @@ STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
326341
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
327342
}
328343
} else if (tok->kind == MP_TOKEN_STRING) {
329-
printf("Pushing string\n");
330-
push_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
331-
return;
332-
// pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qstr_from_strn(tok->str, tok->len));
344+
// Don't automatically intern all strings. doc strings (which are usually large)
345+
// will be discarded by the compiler, and so we shouldn't intern them.
346+
qstr qst = MP_QSTR_NULL;
347+
if (tok->len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
348+
// intern short strings
349+
qst = qstr_from_strn(tok->str, tok->len);
350+
} else {
351+
// check if this string is already interned
352+
qst = qstr_find_strn((const byte*)tok->str, tok->len);
353+
}
354+
if (qst != MP_QSTR_NULL) {
355+
// qstr exists, make a leaf node
356+
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
357+
} else {
358+
// not interned, make a node holding a pointer to the string data
359+
push_result_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
360+
return;
361+
}
333362
} else if (tok->kind == MP_TOKEN_BYTES) {
334363
pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(tok->str, tok->len));
335364
} else {
@@ -338,21 +367,6 @@ printf("Pushing string\n");
338367
push_result_node(parser, pn);
339368
}
340369

341-
STATIC void push_string(parser_t *parser, int src_line, const char *str, uint len) {
342-
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, 2);
343-
if (pn == NULL) {
344-
memory_error(parser);
345-
return;
346-
}
347-
pn->source_line = src_line;
348-
pn->kind_num_nodes = RULE_string | (2 << 8);
349-
char *p = m_new(char, len);
350-
memcpy(p, str, len);
351-
pn->nodes[0] = (machine_int_t)p;
352-
pn->nodes[1] = len;
353-
push_result_node(parser, (mp_parse_node_t)pn);
354-
}
355-
356370
STATIC void push_result_rule(parser_t *parser, int src_line, const rule_t *rule, int num_args) {
357371
mp_parse_node_struct_t *pn = m_new_obj_var_maybe(mp_parse_node_struct_t, mp_parse_node_t, num_args);
358372
if (pn == NULL) {
@@ -541,14 +555,13 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
541555
}
542556
}
543557

544-
#if 1 && !MICROPY_ENABLE_DOC_STRING
545-
// this code discards lonely statement, such as doc strings
546-
// problem is that doc strings have already been interned, so this doesn't really help reduce RAM usage
558+
#if !MICROPY_EMIT_CPYTHON && !MICROPY_ENABLE_DOC_STRING
559+
// this code discards lonely statements, such as doc strings
547560
if (input_kind != MP_PARSE_SINGLE_INPUT && rule->rule_id == RULE_expr_stmt && peek_result(&parser, 0) == MP_PARSE_NODE_NULL) {
548561
mp_parse_node_t p = peek_result(&parser, 1);
549-
if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRING(p)) {
550-
pop_result(parser);
551-
pop_result(parser);
562+
if ((MP_PARSE_NODE_IS_LEAF(p) && !MP_PARSE_NODE_IS_ID(p)) || MP_PARSE_NODE_IS_STRUCT_KIND(p, RULE_string)) {
563+
pop_result(&parser);
564+
pop_result(&parser);
552565
push_result_rule(&parser, rule_src_line, rules[RULE_pass_stmt], 0);
553566
break;
554567
}

py/parse.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,9 @@ typedef struct _mp_parse_node_struct_t {
8080
#define MP_PARSE_NODE_LEAF_SMALL_INT(pn) (((machine_int_t)(pn)) >> 1)
8181
#define MP_PARSE_NODE_STRUCT_KIND(pns) ((pns)->kind_num_nodes & 0xff)
8282
#define MP_PARSE_NODE_STRUCT_NUM_NODES(pns) ((pns)->kind_num_nodes >> 8)
83-
#define MP_PARSE_NODE_IS_STRING(pn) (MP_PARSE_NODE_STRUCT_KIND((mp_parse_node_struct_t*)pn) == RULE_string)
8483

8584
mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg);
86-
uint mp_parse_node_free(mp_parse_node_t pn);
85+
void mp_parse_node_free(mp_parse_node_t pn);
8786

8887
void mp_parse_node_print(mp_parse_node_t pn, int indent);
8988

0 commit comments

Comments
 (0)