From e78b8735bdd5ac3899529d2149a783d875410e65 Mon Sep 17 00:00:00 2001 From: Marco Paolini Date: Sat, 13 Jul 2019 15:49:55 +0100 Subject: [PATCH 1/4] bpo-37587: Make json.loads faster for long strings Forces the compiler to use a register variable for a tight loop in the hot-path. It also optimizes a condition for the common case strict=true. --- .../Library/2019-07-13-16-02-48.bpo-37587.fd-1aF.rst | 1 + Modules/_json.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-07-13-16-02-48.bpo-37587.fd-1aF.rst diff --git a/Misc/NEWS.d/next/Library/2019-07-13-16-02-48.bpo-37587.fd-1aF.rst b/Misc/NEWS.d/next/Library/2019-07-13-16-02-48.bpo-37587.fd-1aF.rst new file mode 100644 index 00000000000000..80a89feab0ce8f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-07-13-16-02-48.bpo-37587.fd-1aF.rst @@ -0,0 +1 @@ +Make json.loads faster for long strings. (Patch by Marco Paolini) diff --git a/Modules/_json.c b/Modules/_json.c index 38beb6f50d2ec0..3f2ab373919f88 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -434,12 +434,16 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next while (1) { /* Find the end of the string or the next escape */ Py_UCS4 c = 0; - for (next = end; next < len; next++) { - c = PyUnicode_READ(kind, buf, next); - if (c == '"' || c == '\\') { + next = end; + /* This is a tight loop in a hot path so we try to avoid + MOV from the register variable into memory. 
See bpo-37587 */ + for (Py_UCS4 c_in = 0; next < len; next++) { + c_in = PyUnicode_READ(kind, buf, next); + if (c_in == '"' || c_in == '\\') { + c = c_in; break; } - else if (strict && c <= 0x1f) { + else if (c_in <= 0x1f && strict) { raise_errmsg("Invalid control character at", pystr, next); goto bail; } From fa52a8372571a5f61d30ab3145bf6abb5e693780 Mon Sep 17 00:00:00 2001 From: Marco Paolini Date: Sat, 13 Jul 2019 20:54:13 +0100 Subject: [PATCH 2/4] Minimize the conditionals and the number of ops inside the json.loads hot path. This partially reverts the previous commit as we found out the culprit wasn't the MOV but the strictness checks --- Modules/_json.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 3f2ab373919f88..006dcd812ee987 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -434,19 +434,20 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next while (1) { /* Find the end of the string or the next escape */ Py_UCS4 c = 0; - next = end; - /* This is a tight loop in a hot path so we try to avoid - MOV from the register variable into memory. See bpo-37587 */ - for (Py_UCS4 c_in = 0; next < len; next++) { - c_in = PyUnicode_READ(kind, buf, next); - if (c_in == '"' || c_in == '\\') { - c = c_in; + Py_ssize_t invalid = -1; + for (next = end; next < len; next++) { + c = PyUnicode_READ(kind, buf, next); + if (c == '"' || c == '\\') { break; } - else if (c_in <= 0x1f && strict) { - raise_errmsg("Invalid control character at", pystr, next); - goto bail; - } + /* Defer the strict error until outside this (hot) loop. 
*/ + /* See bpo-37587 */ + if (c <= 0x1f && invalid < 0) + invalid = next; + } + if (strict && invalid >= 0) { + raise_errmsg("Invalid control character at", pystr, invalid); + goto bail; } if (!(c == '"' || c == '\\')) { raise_errmsg("Unterminated string starting at", pystr, begin); From 70feba64a61613c1230213f5fb8f6521be6f052d Mon Sep 17 00:00:00 2001 From: Marco Paolini Date: Sat, 13 Jul 2019 21:17:00 +0100 Subject: [PATCH 3/4] Cleanup indentation --- Modules/_json.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 006dcd812ee987..6aa5f7cc3ec03c 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -442,8 +442,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } /* Defer the strict error until outside this (hot) loop. */ /* See bpo-37587 */ - if (c <= 0x1f && invalid < 0) - invalid = next; + if (c <= 0x1f && invalid < 0) { + invalid = next; + } } if (strict && invalid >= 0) { raise_errmsg("Invalid control character at", pystr, invalid); From f4ba2f0e62fc117afdad3fbcd6f94b56246c4c3b Mon Sep 17 00:00:00 2001 From: Marco Paolini Date: Tue, 30 Jul 2019 01:38:10 +0100 Subject: [PATCH 4/4] Revert some changes not relevant for performance --- Modules/_json.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 6aa5f7cc3ec03c..76da1d345e9df4 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -434,22 +434,16 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next while (1) { /* Find the end of the string or the next escape */ Py_UCS4 c = 0; - Py_ssize_t invalid = -1; for (next = end; next < len; next++) { c = PyUnicode_READ(kind, buf, next); if (c == '"' || c == '\\') { break; } - /* Defer the strict error until outside this (hot) loop. 
*/ - /* See bpo-37587 */ - if (c <= 0x1f && invalid < 0) { - invalid = next; + else if (c <= 0x1f && strict) { + raise_errmsg("Invalid control character at", pystr, next); + goto bail; } } - if (strict && invalid >= 0) { - raise_errmsg("Invalid control character at", pystr, invalid); - goto bail; - } if (!(c == '"' || c == '\\')) { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail;