From 035b3e23c7c3f40ebe3423e6fa890184d80e1efa Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Thu, 11 Apr 2024 21:40:12 +0200 Subject: [PATCH 1/9] Improve performance of startswith by eliminating double work in tailmatch --- Objects/unicodeobject.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2c259b7e869efe..f8e487a3c92c0a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9272,24 +9272,23 @@ tailmatch(PyObject *self, else offset = start; - if (PyUnicode_READ(kind_self, data_self, offset) == - PyUnicode_READ(kind_sub, data_sub, 0) && - PyUnicode_READ(kind_self, data_self, offset + end_sub) == - PyUnicode_READ(kind_sub, data_sub, end_sub)) { + int last_character_matches = PyUnicode_READ(kind_self, data_self, offset + end_sub) == + PyUnicode_READ(kind_sub, data_sub, end_sub); + + if (last_character_matches) { + if (end_sub==0) + return 1; /* If both are of the same kind, memcmp is sufficient */ if (kind_self == kind_sub) { - return ! memcmp((char *)data_self + - (offset * PyUnicode_KIND(substring)), - data_sub, - PyUnicode_GET_LENGTH(substring) * - PyUnicode_KIND(substring)); + return ! memcmp((char *)data_self + (offset * kind_sub), + data_sub, end_sub * kind_sub); } /* otherwise we have to compare each character by first accessing it */ else { /* We do not need to compare 0 and len(substring)-1 because the if statement above ensured already that they are equal when we end up here. */ - for (i = 1; i < end_sub; ++i) { + for (i = 0; i < end_sub; ++i) { if (PyUnicode_READ(kind_self, data_self, offset + i) != PyUnicode_READ(kind_sub, data_sub, i)) return 0; From 4f4b084eadd50e65f165ad011777e5a7991ff240 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Thu, 11 Apr 2024 22:57:02 +0200 Subject: [PATCH 2/9] code style --- Objects/unicodeobject.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f8e487a3c92c0a..e9417adf7035b4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9272,12 +9272,13 @@ tailmatch(PyObject *self, else offset = start; - int last_character_matches = PyUnicode_READ(kind_self, data_self, offset + end_sub) == - PyUnicode_READ(kind_sub, data_sub, end_sub); + int match_last = PyUnicode_READ(kind_self, data_self, offset + end_sub) == + PyUnicode_READ(kind_sub, data_sub, end_sub); - if (last_character_matches) { - if (end_sub==0) + if (match_last) { + if (end_sub==0) { return 1; + } /* If both are of the same kind, memcmp is sufficient */ if (kind_self == kind_sub) { return ! memcmp((char *)data_self + (offset * kind_sub), From 9f201b16c6d38a3b89a54fdc794410a1c0eb5f0a Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 11 Apr 2024 21:17:26 +0000 Subject: [PATCH 3/9] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst new file mode 100644 index 00000000000000..ea449637abc68e --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst @@ -0,0 +1 @@ +Improve performance of :func:`str.startswith` and `str.endswith`. From 8792d0b9d001a4c8a1b7e523e60f9450098a1e21 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Thu, 11 Apr 2024 23:44:01 +0200 Subject: [PATCH 4/9] lint --- .../2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst index ea449637abc68e..b6be9b7b66ba4f 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst @@ -1 +1 @@ -Improve performance of :func:`str.startswith` and `str.endswith`. +Improve performance of :func:`str.startswith` and :func:`str.endswith`. From 2a2cfb36840bd096ed4d1679d9ac37290f8c75e6 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Mon, 20 May 2024 23:04:00 +0200 Subject: [PATCH 5/9] Update Objects/unicodeobject.c Co-authored-by: Erlend E. Aasland --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1d13227fef282a..7fd29531ad55a4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9276,7 +9276,7 @@ tailmatch(PyObject *self, PyUnicode_READ(kind_sub, data_sub, end_sub); if (match_last) { - if (end_sub==0) { + if (end_sub == 0) { return 1; } /* If both are of the same kind, memcmp is sufficient */ From 9f8e4b880c9c8d08f4a4f5973e1b64db846ab0c7 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Mon, 20 May 2024 23:04:18 +0200 Subject: [PATCH 6/9] update comment --- Objects/unicodeobject.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1d13227fef282a..c27cb27763dc35 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9286,9 +9286,9 @@ tailmatch(PyObject *self, } /* otherwise we have to compare each character by first accessing it */ else { - /* We do not need to compare 0 and len(substring)-1 because - the if statement above ensured already that they are equal - when we end up here. */ + /* We do not need to compare len(substring)-1 because the if + statement above ensured already that they are equal when we + end up here. */ for (i = 0; i < end_sub; ++i) { if (PyUnicode_READ(kind_self, data_self, offset + i) != PyUnicode_READ(kind_sub, data_sub, i)) From 8a7b9fe363d838b8d7e7930e3cf9487a55ec23c8 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Tue, 21 May 2024 22:12:33 +0200 Subject: [PATCH 7/9] update news entry --- .../2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst index b6be9b7b66ba4f..19dc551118ae67 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst @@ -1 +1 @@ -Improve performance of :func:`str.startswith` and :func:`str.endswith`. +Improve performance of :func:`str.startswith`, :func:`str.endswith`, :func:`str.removeprefix` and :func:`str.removesuffix`. From ea862985e973c1b456421bd7b8197e4850c346ac Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Tue, 21 May 2024 22:52:14 +0200 Subject: [PATCH 8/9] cleanup --- Objects/unicodeobject.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 10737760dcceef..0d9fdc443cd315 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9254,7 +9254,8 @@ tailmatch(PyObject *self, Py_ssize_t end_sub; ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); - end -= PyUnicode_GET_LENGTH(substring); + Py_ssize_t substring_length = PyUnicode_GET_LENGTH(substring); + end -= substring_length; if (end < start) return 0; @@ -9265,18 +9266,25 @@ tailmatch(PyObject *self, data_self = PyUnicode_DATA(self); kind_sub = PyUnicode_KIND(substring); data_sub = PyUnicode_DATA(substring); - end_sub = PyUnicode_GET_LENGTH(substring) - 1; + end_sub = substring_length - 1; if (direction > 0) offset = end; else offset = start; - int match_last = PyUnicode_READ(kind_self, data_self, offset + end_sub) == - PyUnicode_READ(kind_sub, data_sub, end_sub); + int match_first = PyUnicode_READ(kind_self, data_self, offset) == + PyUnicode_READ(kind_sub, data_sub, 0); - if (match_last) { - if (end_sub == 0) { + if (match_first) { + if (substring_length == 1) { + // single-character case + return 1; + } + int match_last = PyUnicode_READ(kind_self, data_self, offset + end_sub) == + PyUnicode_READ(kind_sub, data_sub, end_sub); + if (!match_last && substring_length == 2) { + // failing two-character case return 1; } /* If both are of the same kind, memcmp is sufficient */ @@ -9286,10 +9294,10 @@ tailmatch(PyObject *self, } /* otherwise we have to compare each character by first accessing it */ else { - /* We do not need to compare len(substring)-1 because the if - statement above ensured already that they are equal when we + /* We do not need to compare 0 and len(substring)-1 because the if + statements above ensured already that they are equal when we end up here. */ - for (i = 0; i < end_sub; ++i) { + for (i = 1; i < end_sub; ++i) { if (PyUnicode_READ(kind_self, data_self, offset + i) != PyUnicode_READ(kind_sub, data_sub, i)) return 0; From 6aea844959e8ddc3b5b3a843b83106f5c4aead95 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Tue, 21 May 2024 22:17:38 +0200 Subject: [PATCH 9/9] Update Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst Co-authored-by: Erlend E. Aasland --- .../2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst index 19dc551118ae67..1e77d5ba1413b8 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-11-21-17-23.gh-issue-117431.ZxdAFN.rst @@ -1 +1 @@ -Improve performance of :func:`str.startswith`, :func:`str.endswith`, :func:`str.removeprefix` and :func:`str.removesuffix`. +Improve performance of :meth:`str.startswith`, :meth:`str.endswith`, :meth:`str.removeprefix` and :meth:`str.removesuffix`.