From 899f2894c22575f12c80081aae8959af5e8ada2e Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Oct 2020 16:56:01 -0400 Subject: [PATCH 01/19] initial implementation --- Objects/stringlib/fastsearch.h | 238 +++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 56a4467d353813..ac6186d9fcf20c 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -17,6 +17,15 @@ #define FAST_SEARCH 1 #define FAST_RSEARCH 2 + +#if 0 && STRINGLIB_SIZEOF_CHAR == 1 +#define LOG(...) printf(__VA_ARGS__) +#define LOG_STRING(s, n) printf("%.*s", n, s) +#else +#define LOG(...) +#define LOG_STRING(s, n) +#endif + #if LONG_BIT >= 128 #define STRINGLIB_BLOOM_WIDTH 128 #elif LONG_BIT >= 64 @@ -160,6 +169,229 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMCHR_CUT_OFF + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + int inverted, Py_ssize_t *return_period) +{ + // We'll eventually partition needle into + // needle[:max_suffix + 1] + needle[max_suffix + 1:] + Py_ssize_t max_suffix = -1; + + Py_ssize_t suffix = 0; // candidate for max_suffix + Py_ssize_t period = 1; // candidate for return_period + Py_ssize_t k = 1; // working index + + while (suffix + k < needle_len) { + STRINGLIB_CHAR a = needle[suffix + k]; + STRINGLIB_CHAR b = needle[max_suffix + k]; + if (inverted ? (a < b) : (b < a)) { + // Suffix is smaller, period is entire prefix so far. + suffix += k; + k = 1; + period = suffix - max_suffix; + } + else if (a == b) { + // Advance through the repitition of the current period. + if (k != period) { + k++; + } + else { + suffix += period; + k = 1; + } + } + else { + // Found a bigger suffix. 
+ max_suffix = suffix; + suffix += 1; + k = 1; + period = 1; + } + } + *return_period = period; + return max_suffix + 1; +} + + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + Py_ssize_t *return_period) +{ + Py_ssize_t period1, period2, max_suf1, max_suf2; + + // Search using both forward and inverted character-orderings + max_suf1 = STRINGLIB(_lex_search)(needle, needle_len, 0, &period1); + max_suf2 = STRINGLIB(_lex_search)(needle, needle_len, 1, &period2); + + // Choose the later suffix + if (max_suf2 < max_suf1) { + *return_period = period1; + return max_suf1; + } + else { + *return_period = period2; + return max_suf2; + } +} + + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, + Py_ssize_t suffix, Py_ssize_t period) +{ + LOG("========================\n"); + LOG("Two-way with needle="); LOG_STRING(needle, needle_len); + LOG(" and haystack="); LOG_STRING(haystack, haystack_len); + LOG("\nSplit "); LOG_STRING(needle, needle_len); + LOG(" into "); LOG_STRING(needle, suffix); + LOG(" and "); LOG_STRING(needle + suffix, needle_len - suffix); + LOG(".\n"); + + if (memcmp(needle, needle+period, suffix * STRINGLIB_SIZEOF_CHAR) == 0) { + LOG("needle is completely periodic.\n"); + // a mismatch can only advance by the period. + // use memory to avoid re-scanning known occurrences of the period. + Py_ssize_t memory = 0; + Py_ssize_t j = 0; // index into haystack + while (j <= haystack_len - needle_len) { + // Visualize the line-up: + LOG("> "); LOG_STRING(haystack, haystack_len); + LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); + LOG("\n"); + + LOG("Scanning right half.\n"); + Py_ssize_t i = Py_MAX(suffix, memory); + while (i < needle_len && needle[i] == haystack[j+i]) { + i++; + } + if (i >= needle_len) { + LOG("Right half matched. 
Scanning left half.\n"); + i = suffix - 1; + while (memory < i + 1 && needle[i] == haystack[j+i]) { + i--; + } + if (i + 1 < memory + 1) { + LOG("Left half matches. Returning %d.\n", j); + return j; + } + LOG("No match.\n"); + // Remember how many periods were scanned on the right + j += period; + memory = needle_len - period; + } + else { + LOG("Skip without checking left half.\n"); + j += i - suffix + 1; + memory = 0; + } + } + } + else { + LOG("needle is NOT completely periodic.\n"); + // The two halves are distinct; + // no extra memory is required, + // and a mismatch results in a maximal shift. + period = 1 + Py_MAX(suffix, needle_len - suffix); + STRINGLIB_CHAR suffix_start = needle[suffix]; + LOG("Using period %d.\n", period); + LOG("Right half starts with %c\n", suffix_start); + + Py_ssize_t j = 0; + while (j <= haystack_len - needle_len) { + // use faster code looking for first char. + Py_ssize_t find; + find = STRINGLIB(find_char)(haystack + suffix + j, + haystack_len - needle_len - j + 1, + suffix_start); + if (find == -1) { + LOG("Not found. Return -1.\n"); + return -1; + } + j += find; + + LOG("Found at %d.\n", j); + + LOG("> "); LOG_STRING(haystack, haystack_len); + LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); + LOG("\n"); + + LOG("Checking the right half.\n"); + Py_ssize_t i = suffix+1; + for (; i < needle_len; i++) { + if (needle[i] != haystack[j + i]){ + LOG("No match.\n"); + break; + } + } + + if (needle_len <= i) { + LOG("Matches. Checking the left half.\n"); + i = suffix - 1; + for (i = suffix - 1; i >= 0; i--) { + if (needle[i] != haystack[j + i]) { + break; + } + } + if (i == -1) { + LOG("Match! (at %d)\n", j); + return j; + } + j += period; + } + else { + LOG("Jump forward without checking left half.\n"); + j += i - suffix + 1; + } + } + + } + LOG("Reached end. 
Returning -1.\n"); + return -1; +} + + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(memmem)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) +{ + LOG("memmem: needle="); LOG_STRING(needle, needle_len); + LOG(" and haystack="); LOG_STRING(haystack, haystack_len); + LOG("\n"); + if (needle_len == 0) { + LOG("Easy out: empty.\n"); + return 0; + } + STRINGLIB_CHAR first = needle[0]; + + Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); + if (index == -1) { + LOG("Easy out: empty.\n"); + return -1; + } + + if (haystack_len - index < needle_len) { + LOG("Easy out: no room left after first found.\n"); + return -1; + } + + // Do a fast compare to avoid the initialization overhead + if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) == 0) { + LOG("Easy out: Naive guess was correct.\n"); + return index; + } + + // Start later. + index++; + Py_ssize_t period, suffix; + suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); + return STRINGLIB(_two_way)(needle, needle_len, + haystack, haystack_len, + suffix, period); +} + + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, @@ -198,6 +430,10 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, skip = mlast - 1; mask = 0; + if (mode == FAST_SEARCH) { + return STRINGLIB(memmem)(p, m, s, n); + } + if (mode != FAST_RSEARCH) { const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; @@ -281,3 +517,5 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return count; } +#undef LOG +#undef LOG_STRING From 743a382f447f38ccd31c98db0e215c1f6293e88a Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Oct 2020 02:43:36 -0400 Subject: [PATCH 02/19] refactoring --- Objects/stringlib/fastsearch.h | 192 +++++++++++++++++---------------- 1 file changed, 98 insertions(+), 94 deletions(-) diff --git a/Objects/stringlib/fastsearch.h 
b/Objects/stringlib/fastsearch.h index ac6186d9fcf20c..22bbc58105101e 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -2,22 +2,23 @@ #define STRINGLIB_FASTSEARCH_H -/* fast search/count implementation, based on a mix between boyer- + +/* FAST_SEARCH and FAST_COUNT use the two-way algorithm. See: + http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm + Largely influenced by glibc: + https://code.woboq.org/userspace/glibc/string/str-two-way.h.html + https://code.woboq.org/userspace/glibc/string/memmem.c.html + + FAST_RSEARCH uses another algorithm, based on a mix between boyer- moore and horspool, with a few more bells and whistles on the top. for some more background, see: http://effbot.org/zone/stringlib.htm */ -/* note: fastsearch may access s[n], which isn't a problem when using - Python's ordinary string types, but may cause problems if you're - using this code in other contexts. also, the count mode returns -1 - if there cannot possible be a match in the target string, and 0 if - it has actually checked for matches, but didn't find any. callers - beware! */ - #define FAST_COUNT 0 #define FAST_SEARCH 1 #define FAST_RSEARCH 2 - +/* Change to a 1 to see logging comments walk through the algorithm. */ #if 0 && STRINGLIB_SIZEOF_CHAR == 1 #define LOG(...) 
printf(__VA_ARGS__) #define LOG_STRING(s, n) printf("%.*s", n, s) @@ -218,6 +219,13 @@ Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, Py_ssize_t *return_period) { + /* Morally, this is what we want to happen: + >>> x = "GCAGAGAG" + >>> suf, period = _critical_factorization(x) + >>> x[:suf], x[suf:] + ('GC', 'AGAGAG') + >>> period + 2 */ Py_ssize_t period1, period2, max_suf1, max_suf2; // Search using both forward and inverted character-orderings @@ -353,8 +361,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(memmem)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, - const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) +STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) { LOG("memmem: needle="); LOG_STRING(needle, needle_len); LOG(" and haystack="); LOG_STRING(haystack, haystack_len); @@ -371,12 +379,12 @@ STRINGLIB(memmem)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, return -1; } - if (haystack_len - index < needle_len) { + if (haystack_len < needle_len + index) { LOG("Easy out: no room left after first found.\n"); return -1; } - // Do a fast compare to avoid the initialization overhead + // Do a fast compare to maybe avoid the initialization overhead if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) == 0) { LOG("Easy out: Naive guess was correct.\n"); return index; @@ -386,98 +394,97 @@ STRINGLIB(memmem)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, index++; Py_ssize_t period, suffix; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); - return STRINGLIB(_two_way)(needle, needle_len, - haystack, haystack_len, - suffix, period); + Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, + haystack + index, + haystack_len - index, + suffix, period); + + if (result == -1) { + return 
-1; + } + return index + result; } Py_LOCAL_INLINE(Py_ssize_t) -FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, - const STRINGLIB_CHAR* p, Py_ssize_t m, - Py_ssize_t maxcount, int mode) +STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, + Py_ssize_t maxcount) { - unsigned long mask; - Py_ssize_t skip, count = 0; - Py_ssize_t i, j, mlast, w; - - w = n - m; - - if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) - return -1; - - /* look for special cases */ - if (m <= 1) { - if (m <= 0) - return -1; - /* use special case for 1-character strings */ - if (mode == FAST_SEARCH) - return STRINGLIB(find_char)(s, n, p[0]); - else if (mode == FAST_RSEARCH) - return STRINGLIB(rfind_char)(s, n, p[0]); - else { /* FAST_COUNT */ - for (i = 0; i < n; i++) - if (s[i] == p[0]) { - count++; - if (count == maxcount) - return maxcount; + if (maxcount == 0) { + return 0; + } + if (needle_len == 1) { + Py_ssize_t count = 0; + for (Py_ssize_t i = 0; i < haystack_len; i++) { + if (haystack[i] == needle[0]) { + count++; + if (count == maxcount) { + return maxcount; } + } + } + return count; + } + if (needle_len == 0) { + return haystack_len + 1; + } + STRINGLIB_CHAR first = needle[0]; + Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); + if (index == -1) { + return 0; + } + if (haystack_len < needle_len + index) { + return -1; + } + Py_ssize_t suffix, period; + suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); + Py_ssize_t count = 0; + while (1) { + Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, + haystack + index, + haystack_len - index, + suffix, period); + if (result == -1) { return count; } + else { + count++; + if (count == maxcount) { + return maxcount; + } + index += result + needle_len; + } } - mlast = m - 1; - skip = mlast - 1; - mask = 0; - - if (mode == FAST_SEARCH) { - return STRINGLIB(memmem)(p, m, s, n); - } +} - if (mode != 
FAST_RSEARCH) { - const STRINGLIB_CHAR *ss = s + m - 1; - const STRINGLIB_CHAR *pp = p + m - 1; - /* create compressed boyer-moore delta 1 table */ +Py_LOCAL_INLINE(Py_ssize_t) +FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode) +{ + if (m > n) { + return -1; + } - /* process pattern[:-1] */ - for (i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[mlast]) - skip = mlast - i - 1; + if (mode == FAST_SEARCH) { + return STRINGLIB(_fastsearch)(p, m, s, n); + } + else if (mode == FAST_COUNT) { + return STRINGLIB(_fastcount)(p, m, s, n, maxcount); + } + else { /* FAST_RSEARCH */ + if (m == 1) { + return STRINGLIB(rfind_char)(s, n, p[0]); } - /* process pattern[-1] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[mlast]); - for (i = 0; i <= w; i++) { - /* note: using mlast in the skip path slows things down on x86 */ - if (ss[i] == pp[0]) { - /* candidate match */ - for (j = 0; j < mlast; j++) - if (s[i+j] != p[j]) - break; - if (j == mlast) { - /* got a match! 
*/ - if (mode != FAST_COUNT) - return i; - count++; - if (count == maxcount) - return maxcount; - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) - i = i + m; - else - i = i + skip; - } else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) - i = i + m; - } - } - } else { /* FAST_RSEARCH */ + Py_ssize_t w = n - m; + Py_ssize_t mlast = m - 1; + Py_ssize_t skip = mlast - 1; + Py_ssize_t mask = 0; + Py_ssize_t i, j; /* create compressed boyer-moore delta 1 table */ @@ -510,11 +517,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, i = i - m; } } - } - - if (mode != FAST_COUNT) return -1; - return count; + } } #undef LOG From fdb68009f7c4ef4d9c549b6a0e5bcc76926b90d3 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Oct 2020 12:34:45 -0400 Subject: [PATCH 03/19] formatting fixes --- Objects/stringlib/fastsearch.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 22bbc58105101e..45ccf7a150b57f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -8,7 +8,7 @@ https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm Largely influenced by glibc: https://code.woboq.org/userspace/glibc/string/str-two-way.h.html - https://code.woboq.org/userspace/glibc/string/memmem.c.html + https://code.woboq.org/userspace/glibc/string/memmem.c.html FAST_RSEARCH uses another algorithm, based on a mix between boyer- moore and horspool, with a few more bells and whistles on the top. 
@@ -227,7 +227,7 @@ STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t need >>> period 2 */ Py_ssize_t period1, period2, max_suf1, max_suf2; - + // Search using both forward and inverted character-orderings max_suf1 = STRINGLIB(_lex_search)(needle, needle_len, 0, &period1); max_suf2 = STRINGLIB(_lex_search)(needle, needle_len, 1, &period2); @@ -320,7 +320,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, j += find; LOG("Found at %d.\n", j); - + LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); @@ -331,7 +331,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, if (needle[i] != haystack[j + i]){ LOG("No match.\n"); break; - } + } } if (needle_len <= i) { @@ -364,29 +364,22 @@ Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) { - LOG("memmem: needle="); LOG_STRING(needle, needle_len); - LOG(" and haystack="); LOG_STRING(haystack, haystack_len); - LOG("\n"); if (needle_len == 0) { - LOG("Easy out: empty.\n"); return 0; } STRINGLIB_CHAR first = needle[0]; Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); if (index == -1) { - LOG("Easy out: empty.\n"); return -1; } if (haystack_len < needle_len + index) { - LOG("Easy out: no room left after first found.\n"); return -1; } // Do a fast compare to maybe avoid the initialization overhead if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) == 0) { - LOG("Easy out: Naive guess was correct.\n"); return index; } From 737ac8a3a328e0c5f00a918e6e8308d42a878917 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Oct 2020 17:22:30 -0400 Subject: [PATCH 04/19] add shift and bloom --- Objects/stringlib/fastsearch.h | 50 ++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git 
a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 45ccf7a150b57f..470079ee0e36d1 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -256,6 +256,12 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, LOG(" into "); LOG_STRING(needle, suffix); LOG(" and "); LOG_STRING(needle + suffix, needle_len - suffix); LOG(".\n"); + unsigned long mask = 0; + + /* Get the set of characters (mod 2^k) in the needle. */ + for (Py_ssize_t i = 0; i < needle_len; i++) { + STRINGLIB_BLOOM_ADD(mask, needle[i]); + } if (memcmp(needle, needle+period, suffix * STRINGLIB_SIZEOF_CHAR) == 0) { LOG("needle is completely periodic.\n"); @@ -264,11 +270,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, Py_ssize_t memory = 0; Py_ssize_t j = 0; // index into haystack while (j <= haystack_len - needle_len) { + // Visualize the line-up: LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); + if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { + LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); + memory = 0; + j += needle_len; + continue; + } + LOG("Scanning right half.\n"); Py_ssize_t i = Py_MAX(suffix, memory); while (i < needle_len && needle[i] == haystack[j+i]) { @@ -298,16 +312,33 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("needle is NOT completely periodic.\n"); + + period = 1 + Py_MAX(suffix, needle_len - suffix); + STRINGLIB_CHAR suffix_start = needle[suffix]; + + // Compute the distance between suffix_start and the pervious + // occurrence of suffix_start. + Py_ssize_t shift = suffix; + for (Py_ssize_t k = suffix - 1; k >= 0; k--) { + if (needle[k] == suffix_start) { + shift = suffix - k; + break; + } + } + // The two halves are distinct; // no extra memory is required, // and a mismatch results in a maximal shift. 
- period = 1 + Py_MAX(suffix, needle_len - suffix); - STRINGLIB_CHAR suffix_start = needle[suffix]; LOG("Using period %d.\n", period); LOG("Right half starts with %c\n", suffix_start); Py_ssize_t j = 0; while (j <= haystack_len - needle_len) { + if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { + LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); + j += needle_len; + continue; + } // use faster code looking for first char. Py_ssize_t find; find = STRINGLIB(find_char)(haystack + suffix + j, @@ -321,12 +352,18 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, LOG("Found at %d.\n", j); + if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { + LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); + j += needle_len; + continue; + } + LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); LOG("Checking the right half.\n"); - Py_ssize_t i = suffix+1; + Py_ssize_t i = suffix + 1; for (; i < needle_len; i++) { if (needle[i] != haystack[j + i]){ LOG("No match.\n"); @@ -334,7 +371,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } } - if (needle_len <= i) { + if (i >= needle_len) { LOG("Matches. Checking the left half.\n"); i = suffix - 1; for (i = suffix - 1; i >= 0; i--) { @@ -350,7 +387,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("Jump forward without checking left half.\n"); - j += i - suffix + 1; + // Note: In common cases, "shift" wins. 
+ j += Py_MAX(shift, i - suffix + 1); } } @@ -365,7 +403,7 @@ STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) { if (needle_len == 0) { - return 0; + return -1; } STRINGLIB_CHAR first = needle[0]; From 658038d575e519c05d9b734b0f5f3f7ea988193b Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 12 Oct 2020 23:46:52 +0000 Subject: [PATCH 05/19] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst b/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst new file mode 100644 index 00000000000000..733e82a7187539 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst @@ -0,0 +1 @@ +Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now use the "Two-Way" string comparison algorithm to avoid quadratic behavior in the worst cases. \ No newline at end of file From b62e4c69d31a3bc6cb50a235e7288105712617dd Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Tue, 13 Oct 2020 23:59:40 -0400 Subject: [PATCH 06/19] add alternating find_char calls --- Objects/stringlib/fastsearch.h | 115 ++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 31 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 470079ee0e36d1..613d30299b9edc 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -21,7 +21,7 @@ /* Change to a 1 to see logging comments walk through the algorithm. 
*/ #if 0 && STRINGLIB_SIZEOF_CHAR == 1 #define LOG(...) printf(__VA_ARGS__) -#define LOG_STRING(s, n) printf("%.*s", n, s) +#define LOG_STRING(s, n) printf("\"%.*s\"", n, s) #else #define LOG(...) #define LOG_STRING(s, n) @@ -313,51 +313,104 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, else { LOG("needle is NOT completely periodic.\n"); + // The two halves are distinct; + // no extra memory is required, + // and a mismatch results in a maximal shift. period = 1 + Py_MAX(suffix, needle_len - suffix); STRINGLIB_CHAR suffix_start = needle[suffix]; + STRINGLIB_CHAR suffix_end = needle[needle_len - 1]; + + LOG("Using period %d.\n", period); + LOG("Right half starts with %c\n", suffix_start); + LOG("Right half endswith %c\n", suffix_end); // Compute the distance between suffix_start and the pervious // occurrence of suffix_start. - Py_ssize_t shift = suffix; + Py_ssize_t middle_shift = suffix; for (Py_ssize_t k = suffix - 1; k >= 0; k--) { if (needle[k] == suffix_start) { - shift = suffix - k; + middle_shift = suffix - k; break; } } - // The two halves are distinct; - // no extra memory is required, - // and a mismatch results in a maximal shift. - LOG("Using period %d.\n", period); - LOG("Right half starts with %c\n", suffix_start); - - Py_ssize_t j = 0; - while (j <= haystack_len - needle_len) { - if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { - LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); - j += needle_len; - continue; + Py_ssize_t end_shift = needle_len; + for (Py_ssize_t k = needle_len - 1; k >= 0; k--) { + if (needle[k] == suffix_end) { + end_shift = needle_len - k; + break; } - // use faster code looking for first char. - Py_ssize_t find; - find = STRINGLIB(find_char)(haystack + suffix + j, - haystack_len - needle_len - j + 1, - suffix_start); - if (find == -1) { - LOG("Not found. 
Return -1.\n"); - return -1; + } + + Py_ssize_t both_shift = suffix; + for (Py_ssize_t k = 1; suffix - k >= 0; k++) { + if (needle[suffix - k] == suffix_start + && suffix_start == needle[needle_len - 1 - k]) + { + both_shift = k; + break; } - j += find; + } - LOG("Found at %d.\n", j); + Py_ssize_t j = 0; + while (j <= haystack_len - needle_len) { + while (1) { + // scan until middle matches + Py_ssize_t find; + find = STRINGLIB(find_char)(haystack + j + suffix, + haystack_len - j - needle_len + 1, + suffix_start); + if (find == -1) { + return -1; + } + else { + j += find; + if (j > haystack_len - needle_len) { + assert(j <= haystack_len - needle_len); + } + } + if (haystack[j + suffix] != suffix_start) { + assert(haystack[j + suffix] == suffix_start); + } - if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { - LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); - j += needle_len; - continue; + STRINGLIB_CHAR end = haystack[j+needle_len-1]; + if (end == suffix_end) { + break; + } + else if (!STRINGLIB_BLOOM(mask, end)) { + j += needle_len; + } + else { + j += middle_shift; + } + if (j > haystack_len - needle_len) { + return -1; + } + find = STRINGLIB(find_char)(haystack + j + needle_len - 1, + haystack_len - j - needle_len + 1, + suffix_end); + if (find == -1) { + return -1; + } + else { + j += find; + assert(j <= haystack_len - needle_len); + } + assert(haystack[j + needle_len - 1] == suffix_end); + if (haystack[j+suffix] == suffix_start) { + break; + } + else { + j += end_shift; + } + if (j > haystack_len - needle_len) { + return -1; + } } + assert(haystack[j + suffix] == suffix_start); + assert(haystack[j + needle_len - 1] == suffix_end); + LOG("> "); LOG_STRING(haystack, haystack_len); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); @@ -387,8 +440,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("Jump forward without checking left half.\n"); - // Note: In common 
cases, "shift" wins. - j += Py_MAX(shift, i - suffix + 1); + // Note: In common cases, "both_shift" wins. + j += Py_MAX(both_shift, i - suffix + 1); } } From 25a61fb596f3c640d4efda445d1a4f6610aa4c5c Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 14 Oct 2020 12:08:47 -0400 Subject: [PATCH 07/19] USe a shift table --- Objects/stringlib/fastsearch.h | 205 +++++++++++++++++---------------- 1 file changed, 104 insertions(+), 101 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 613d30299b9edc..37f8c5a0c51128 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -193,7 +193,7 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, period = suffix - max_suffix; } else if (a == b) { - // Advance through the repitition of the current period. + // Advance through the repetition of the current period. if (k != period) { k++; } @@ -243,11 +243,39 @@ STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t need } } +#define SHIFT_TYPE uint16_t +#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define SHIFT_OVERFLOW (NOT_FOUND - 1U) + +#define TABLE_SIZE_BITS 7 +#define TABLE_SIZE (1U << TABLE_SIZE_BITS) +#define TABLE_MASK (TABLE_SIZE - 1U) + +Py_LOCAL_INLINE(void) +STRINGLIB(_init_table)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, + SHIFT_TYPE *table) +{ + // Fill the table with TABLE_MASK + memset(table, 0xff, TABLE_SIZE * sizeof(SHIFT_TYPE)); + assert(table[0] == NOT_FOUND); + assert(table[TABLE_SIZE - 1] == NOT_FOUND); + for (Py_ssize_t j = 0; j < needle_len; j++) { + // CODE: + // TABLE_MASK means not in string + // TABLE_MASK-1 means shift at least TABLE_MASK-1 + Py_ssize_t shift = needle_len - j - 1; + if (shift > SHIFT_OVERFLOW) { + shift = SHIFT_OVERFLOW; + } + table[needle[j] & TABLE_MASK] = shift; + } +} Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR 
*haystack, Py_ssize_t haystack_len, - Py_ssize_t suffix, Py_ssize_t period) + Py_ssize_t suffix, Py_ssize_t period, + SHIFT_TYPE *shift_table) { LOG("========================\n"); LOG("Two-way with needle="); LOG_STRING(needle, needle_len); @@ -256,12 +284,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, LOG(" into "); LOG_STRING(needle, suffix); LOG(" and "); LOG_STRING(needle + suffix, needle_len - suffix); LOG(".\n"); - unsigned long mask = 0; - - /* Get the set of characters (mod 2^k) in the needle. */ - for (Py_ssize_t i = 0; i < needle_len; i++) { - STRINGLIB_BLOOM_ADD(mask, needle[i]); - } if (memcmp(needle, needle+period, suffix * STRINGLIB_SIZEOF_CHAR) == 0) { LOG("needle is completely periodic.\n"); @@ -276,11 +298,38 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); LOG("\n"); - if (!STRINGLIB_BLOOM(mask, haystack[j + needle_len - 1])) { - LOG("'%c' not in needle; skipping ahead!\n", haystack[j + needle_len - 1]); - memory = 0; - j += needle_len; - continue; + STRINGLIB_CHAR last = haystack[j + needle_len - 1]; + int index = last & TABLE_MASK; + SHIFT_TYPE shift = shift_table[index]; + + //SHIFT_TYPE shift = shift_table[haystack[j + needle_len - 1] && TABLE_MASK]; + switch (shift) + { + case 0: { + break; + } + case NOT_FOUND: { + LOG("Last character not found in string.\n"); + memory = 0; + j += needle_len; + continue; + } + case SHIFT_OVERFLOW: { + LOG("Shift overflowed.\n"); + memory = 0; + j += SHIFT_OVERFLOW; + continue; + } + default: { + if (memory && shift < period) { + LOG("Shifting through multiple periods.\n"); + j += needle_len - period; + } else { + LOG("Table says shift by %d.\n", shift); + j += shift; + } + memory = 0; + } } LOG("Scanning right half.\n"); @@ -317,106 +366,45 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, // no extra memory is required, // and a mismatch results in a maximal 
shift. period = 1 + Py_MAX(suffix, needle_len - suffix); - STRINGLIB_CHAR suffix_start = needle[suffix]; - STRINGLIB_CHAR suffix_end = needle[needle_len - 1]; LOG("Using period %d.\n", period); - LOG("Right half starts with %c\n", suffix_start); - LOG("Right half endswith %c\n", suffix_end); - - // Compute the distance between suffix_start and the pervious - // occurrence of suffix_start. - Py_ssize_t middle_shift = suffix; - for (Py_ssize_t k = suffix - 1; k >= 0; k--) { - if (needle[k] == suffix_start) { - middle_shift = suffix - k; - break; - } - } - - Py_ssize_t end_shift = needle_len; - for (Py_ssize_t k = needle_len - 1; k >= 0; k--) { - if (needle[k] == suffix_end) { - end_shift = needle_len - k; - break; - } - } - - Py_ssize_t both_shift = suffix; - for (Py_ssize_t k = 1; suffix - k >= 0; k++) { - if (needle[suffix - k] == suffix_start - && suffix_start == needle[needle_len - 1 - k]) - { - both_shift = k; - break; - } - } Py_ssize_t j = 0; while (j <= haystack_len - needle_len) { - while (1) { - // scan until middle matches - Py_ssize_t find; - find = STRINGLIB(find_char)(haystack + j + suffix, - haystack_len - j - needle_len + 1, - suffix_start); - if (find == -1) { - return -1; - } - else { - j += find; - if (j > haystack_len - needle_len) { - assert(j <= haystack_len - needle_len); - } - } - if (haystack[j + suffix] != suffix_start) { - assert(haystack[j + suffix] == suffix_start); - } + LOG("> "); LOG_STRING(haystack, haystack_len); + LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); + LOG("\n"); - STRINGLIB_CHAR end = haystack[j+needle_len-1]; - if (end == suffix_end) { + STRINGLIB_CHAR last = haystack[j + needle_len - 1]; + int index = last & TABLE_MASK; + SHIFT_TYPE shift = shift_table[index]; + switch (shift) + { + case 0: { break; } - else if (!STRINGLIB_BLOOM(mask, end)) { + case NOT_FOUND: { + LOG("Last character not found in string.\n"); j += needle_len; + continue; } - else { - j += middle_shift; - } - if (j > haystack_len - 
needle_len) { - return -1; - } - find = STRINGLIB(find_char)(haystack + j + needle_len - 1, - haystack_len - j - needle_len + 1, - suffix_end); - if (find == -1) { - return -1; - } - else { - j += find; - assert(j <= haystack_len - needle_len); - } - assert(haystack[j + needle_len - 1] == suffix_end); - if (haystack[j+suffix] == suffix_start) { - break; + case SHIFT_OVERFLOW: { + LOG("Shift overflowed.\n"); + j += SHIFT_OVERFLOW; + continue; } - else { - j += end_shift; - } - if (j > haystack_len - needle_len) { - return -1; + default: { + LOG("Table says shift by %d.\n", shift); + j += shift; } } - assert(haystack[j + suffix] == suffix_start); - assert(haystack[j + needle_len - 1] == suffix_end); - - LOG("> "); LOG_STRING(haystack, haystack_len); - LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, needle_len); - LOG("\n"); + if (j > haystack_len - needle_len) { + return -1; + } LOG("Checking the right half.\n"); - Py_ssize_t i = suffix + 1; + Py_ssize_t i = suffix; for (; i < needle_len; i++) { if (needle[i] != haystack[j + i]){ LOG("No match.\n"); @@ -440,8 +428,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("Jump forward without checking left half.\n"); - // Note: In common cases, "both_shift" wins. 
- j += Py_MAX(both_shift, i - suffix + 1); + j += i - suffix + 1; } } @@ -478,10 +465,16 @@ STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, index++; Py_ssize_t period, suffix; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); + + // make a skip table + SHIFT_TYPE shift_table[TABLE_SIZE]; + STRINGLIB(_init_table)(needle, needle_len, shift_table); + Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, haystack + index, haystack_len - index, - suffix, period); + suffix, period, + shift_table); if (result == -1) { return -1; @@ -523,12 +516,15 @@ STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } Py_ssize_t suffix, period; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); + SHIFT_TYPE shift_table[TABLE_SIZE]; + STRINGLIB(_init_table)(needle, needle_len, shift_table); Py_ssize_t count = 0; while (1) { Py_ssize_t result = STRINGLIB(_two_way)(needle, needle_len, haystack + index, haystack_len - index, - suffix, period); + suffix, period, + shift_table); if (result == -1) { return count; } @@ -543,6 +539,13 @@ STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } +#undef SHIFT_TYPE +#undef NOT_FOUND +#undef SHIFT_OVERFLOW +#undef TABLE_SIZE_BITS +#undef TABLE_SIZE +#undef TABLE_MASK + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, From 64b9a0a3e54ed863a88610ce602f0ce90502d0af Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 14 Oct 2020 16:25:34 -0400 Subject: [PATCH 08/19] compute a shift for the last character --- Objects/stringlib/fastsearch.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 37f8c5a0c51128..40a52fd9d269f9 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -267,7 +267,7 @@ STRINGLIB(_init_table)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, if 
(shift > SHIFT_OVERFLOW) { shift = SHIFT_OVERFLOW; } - table[needle[j] & TABLE_MASK] = shift; + table[needle[j] & TABLE_MASK] = (SHIFT_TYPE)shift; } } @@ -362,6 +362,15 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, else { LOG("needle is NOT completely periodic.\n"); + Py_ssize_t shift = needle_len; + STRINGLIB_CHAR last_in_needle = needle[needle_len - 1]; + for (Py_ssize_t i = needle_len - 1; i >= 0; i++) { + if ((last_in_needle & TABLE_MASK) == (needle[i] & TABLE_MASK)) { + shift = i; + break; + } + } + LOG("Last character shift is %d.\n", shift); // The two halves are distinct; // no extra memory is required, // and a mismatch results in a maximal shift. @@ -396,12 +405,12 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, default: { LOG("Table says shift by %d.\n", shift); j += shift; + continue; } } - if (j > haystack_len - needle_len) { - return -1; - } + assert((haystack[j + needle_len - 1] & TABLE_MASK) + == (needle[needle_len - 1] & TABLE_MASK)); LOG("Checking the right half.\n"); Py_ssize_t i = suffix; @@ -428,7 +437,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("Jump forward without checking left half.\n"); - j += i - suffix + 1; + j += Py_MAX(shift, i - suffix + 1); } } @@ -570,7 +579,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, Py_ssize_t w = n - m; Py_ssize_t mlast = m - 1; Py_ssize_t skip = mlast - 1; - Py_ssize_t mask = 0; + unsigned long mask = 0; Py_ssize_t i, j; /* create compressed boyer-moore delta 1 table */ From 5568ca2b75006791ceeec6a310822530031e97e3 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 14 Oct 2020 16:38:19 -0400 Subject: [PATCH 09/19] Remove unnecessary special case --- Objects/stringlib/fastsearch.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 40a52fd9d269f9..6368352a5aefd7 100644 --- a/Objects/stringlib/fastsearch.h +++ 
b/Objects/stringlib/fastsearch.h @@ -397,11 +397,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, j += needle_len; continue; } - case SHIFT_OVERFLOW: { - LOG("Shift overflowed.\n"); - j += SHIFT_OVERFLOW; - continue; - } default: { LOG("Table says shift by %d.\n", shift); j += shift; From 415f49253d787a8f6a3118efac775b2d8dfd686e Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 15 Oct 2020 03:10:35 -0400 Subject: [PATCH 10/19] removed unneeded shift computation --- Objects/stringlib/fastsearch.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 6368352a5aefd7..96f20cf6772bf0 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -329,6 +329,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, j += shift; } memory = 0; + continue; } } @@ -361,21 +362,10 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("needle is NOT completely periodic.\n"); - - Py_ssize_t shift = needle_len; - STRINGLIB_CHAR last_in_needle = needle[needle_len - 1]; - for (Py_ssize_t i = needle_len - 1; i >= 0; i++) { - if ((last_in_needle & TABLE_MASK) == (needle[i] & TABLE_MASK)) { - shift = i; - break; - } - } - LOG("Last character shift is %d.\n", shift); // The two halves are distinct; // no extra memory is required, // and a mismatch results in a maximal shift. 
period = 1 + Py_MAX(suffix, needle_len - suffix); - LOG("Using period %d.\n", period); Py_ssize_t j = 0; @@ -432,7 +422,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } else { LOG("Jump forward without checking left half.\n"); - j += Py_MAX(shift, i - suffix + 1); + j += i - suffix + 1; } } From 06c3678ba3661bce4ddbb2cf8b39b55f562849d0 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 02:52:41 -0400 Subject: [PATCH 11/19] restore original code with special case for long needles --- Objects/stringlib/fastsearch.h | 122 +++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 27 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 96f20cf6772bf0..87419168f16f87 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -302,7 +302,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, int index = last & TABLE_MASK; SHIFT_TYPE shift = shift_table[index]; - //SHIFT_TYPE shift = shift_table[haystack[j + needle_len - 1] && TABLE_MASK]; switch (shift) { case 0: { @@ -431,7 +430,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, return -1; } - Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) @@ -441,26 +439,27 @@ STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } STRINGLIB_CHAR first = needle[0]; - Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); + Py_ssize_t index = STRINGLIB(find_char)(haystack, + haystack_len - needle_len + 1, + first); if (index == -1) { return -1; } - if (haystack_len < needle_len + index) { - return -1; - } - - // Do a fast compare to maybe avoid the initialization overhead + // Do a fast compare in all cases to maybe avoid the initialization overhead if (memcmp(haystack+index, needle, needle_len*STRINGLIB_SIZEOF_CHAR) 
== 0) { return index; } + else { + // Start later. + index++; + } - // Start later. - index++; + // Prework: factorize. Py_ssize_t period, suffix; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); - // make a skip table + // Prework: make a skip table. SHIFT_TYPE shift_table[TABLE_SIZE]; STRINGLIB(_init_table)(needle, needle_len, shift_table); @@ -546,26 +545,92 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode) { - if (m > n) { + unsigned long mask; + Py_ssize_t skip, count = 0; + Py_ssize_t i, j, mlast, w; + + w = n - m; + + if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) return -1; - } - if (mode == FAST_SEARCH) { - return STRINGLIB(_fastsearch)(p, m, s, n); - } - else if (mode == FAST_COUNT) { - return STRINGLIB(_fastcount)(p, m, s, n, maxcount); - } - else { /* FAST_RSEARCH */ - if (m == 1) { + /* look for special cases */ + if (m <= 1) { + if (m <= 0) + return -1; + /* use special case for 1-character strings */ + if (mode == FAST_SEARCH) + return STRINGLIB(find_char)(s, n, p[0]); + else if (mode == FAST_RSEARCH) return STRINGLIB(rfind_char)(s, n, p[0]); + else { /* FAST_COUNT */ + for (i = 0; i < n; i++) + if (s[i] == p[0]) { + count++; + if (count == maxcount) + return maxcount; + } + return count; + } + } + + mlast = m - 1; + skip = mlast; + mask = 0; + + if (mode != FAST_RSEARCH) { + if (m >= 10) { + /* long needles get the two-way algorithm. 
*/ + if (mode == FAST_SEARCH) { + return STRINGLIB(_fastsearch)(p, m, s, n); + } + else { + return STRINGLIB(_fastcount)(p, m, s, n, maxcount); + } + } + const STRINGLIB_CHAR *ss = s + m - 1; + const STRINGLIB_CHAR *pp = p + m - 1; + + /* create compressed boyer-moore delta 1 table */ + + /* process pattern[:-1] */ + for (i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[mlast]) + skip = mlast - i - 1; } + /* process pattern[-1] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[mlast]); - Py_ssize_t w = n - m; - Py_ssize_t mlast = m - 1; - Py_ssize_t skip = mlast - 1; - unsigned long mask = 0; - Py_ssize_t i, j; + for (i = 0; i <= w; i++) { + /* note: using mlast in the skip path slows things down on x86 */ + if (ss[i] == pp[0]) { + /* candidate match */ + for (j = 0; j < mlast; j++) + if (s[i+j] != p[j]) + break; + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) + return i; + count++; + if (count == maxcount) + return maxcount; + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) + i = i + m; + else + i = i + skip; + } else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) + i = i + m; + } + } + } else { /* FAST_RSEARCH */ /* create compressed boyer-moore delta 1 table */ @@ -598,8 +663,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, i = i - m; } } - return -1; } + + if (mode != FAST_COUNT) + return -1; + return count; } #undef LOG From 89bdc3459cc63cc2c2d8be6ef9ec8b66f9550d31 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 04:27:44 -0400 Subject: [PATCH 12/19] Minor code cleanups --- Objects/stringlib/fastsearch.h | 62 +++++++++++++++------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 87419168f16f87..df75b2f07814c2 100644 --- a/Objects/stringlib/fastsearch.h +++ 
b/Objects/stringlib/fastsearch.h @@ -2,17 +2,24 @@ #define STRINGLIB_FASTSEARCH_H +/* fast search/count implementation, based on a mix between boyer- + moore and horspool, with a few more bells and whistles on the top. + for some more background, see: http://effbot.org/zone/stringlib.htm */ -/* FAST_SEARCH and FAST_COUNT use the two-way algorithm. See: - http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 +/* When the needle/pattern is long enough during a forward search + or count, use the more complex Two-Way algorithm, which leverages + patterns in the string to ensure no worse than linear time. + Additionally, a Boyer-Moore bad-character shift table is computed + so that sublinear (as in O(n/m)) time is achieved in more cases. + References: + http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm - Largely influenced by glibc: + This implementation was largely influenced by glibc: https://code.woboq.org/userspace/glibc/string/str-two-way.h.html https://code.woboq.org/userspace/glibc/string/memmem.c.html - - FAST_RSEARCH uses another algorithm, based on a mix between boyer- - moore and horspool, with a few more bells and whistles on the top. - for some more background, see: http://effbot.org/zone/stringlib.htm */ + Discussion here: + https://bugs.python.org/issue41972 + */ #define FAST_COUNT 0 #define FAST_SEARCH 1 @@ -170,7 +177,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMCHR_CUT_OFF - +// Preprocessing steps for the two-way algorithm. 
Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, int inverted, Py_ssize_t *return_period) @@ -216,7 +223,8 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, +STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, + Py_ssize_t needle_len, Py_ssize_t *return_period) { /* Morally, this is what we want to happen: @@ -243,6 +251,7 @@ STRINGLIB(_critical_factorization)(const STRINGLIB_CHAR *needle, Py_ssize_t need } } + #define SHIFT_TYPE uint16_t #define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) #define SHIFT_OVERFLOW (NOT_FOUND - 1U) @@ -255,14 +264,13 @@ Py_LOCAL_INLINE(void) STRINGLIB(_init_table)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, SHIFT_TYPE *table) { - // Fill the table with TABLE_MASK + // Fill the table with NOT_FOUND memset(table, 0xff, TABLE_SIZE * sizeof(SHIFT_TYPE)); assert(table[0] == NOT_FOUND); assert(table[TABLE_SIZE - 1] == NOT_FOUND); for (Py_ssize_t j = 0; j < needle_len; j++) { - // CODE: // TABLE_MASK means not in string - // TABLE_MASK-1 means shift at least TABLE_MASK-1 + // SHIFT_OVERFLOW means shift at least SHIFT_OVERFLOW Py_ssize_t shift = needle_len - j - 1; if (shift > SHIFT_OVERFLOW) { shift = SHIFT_OVERFLOW; @@ -271,6 +279,7 @@ STRINGLIB(_init_table)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, } } + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, @@ -430,6 +439,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, return -1; } + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) @@ -481,32 +491,13 @@ STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t 
needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, Py_ssize_t maxcount) { - if (maxcount == 0) { - return 0; - } - if (needle_len == 1) { - Py_ssize_t count = 0; - for (Py_ssize_t i = 0; i < haystack_len; i++) { - if (haystack[i] == needle[0]) { - count++; - if (count == maxcount) { - return maxcount; - } - } - } - return count; - } - if (needle_len == 0) { - return haystack_len + 1; - } STRINGLIB_CHAR first = needle[0]; - Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len, first); + Py_ssize_t index = STRINGLIB(find_char)(haystack, + haystack_len - needle_len + 1, + first); if (index == -1) { return 0; } - if (haystack_len < needle_len + index) { - return -1; - } Py_ssize_t suffix, period; suffix = STRINGLIB(_critical_factorization)(needle, needle_len, &period); SHIFT_TYPE shift_table[TABLE_SIZE]; @@ -588,6 +579,9 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(_fastcount)(p, m, s, n, maxcount); } } + + /* Short needles use Fredrik Lundh's Horspool/Sunday hybrid + aglorithm for less overhead. */ const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; From 9d7bbc38ddcdc640779e511f785a461d08cadf14 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 15:26:23 -0400 Subject: [PATCH 13/19] Restore comment and fix typo --- Objects/stringlib/fastsearch.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index df75b2f07814c2..e342552b23fba0 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -21,6 +21,13 @@ https://bugs.python.org/issue41972 */ +/* note: fastsearch may access s[n], which isn't a problem when using + Python's ordinary string types, but may cause problems if you're + using this code in other contexts. 
also, the count mode returns -1 + if there cannot possible be a match in the target string, and 0 if + it has actually checked for matches, but didn't find any. callers + beware! */ + #define FAST_COUNT 0 #define FAST_SEARCH 1 #define FAST_RSEARCH 2 @@ -581,7 +588,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } /* Short needles use Fredrik Lundh's Horspool/Sunday hybrid - aglorithm for less overhead. */ + algorithm for less overhead. */ const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; From c3313075db46012570f2e754414ae8942b03a41e Mon Sep 17 00:00:00 2001 From: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> Date: Sat, 17 Oct 2020 16:03:47 -0400 Subject: [PATCH 14/19] Update 2020-10-12-23-46-49.bpo-41972.0pHodE.rst --- .../Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst b/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst index 733e82a7187539..e340d690590e50 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2020-10-12-23-46-49.bpo-41972.0pHodE.rst @@ -1 +1 @@ -Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now use the "Two-Way" string comparison algorithm to avoid quadratic behavior in the worst cases. \ No newline at end of file +Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now use the "Two-Way" string comparison algorithm whenever ``str1`` is long enough, to avoid quadratic behavior in the worst cases. 
From 66377cee02500bafc633bc343ad63f5801577de7 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 19:37:21 -0400 Subject: [PATCH 15/19] Add test cases catered to the new algorithm --- Lib/test/string_tests.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 527f505c0169b3..f945afb7934abd 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -5,6 +5,7 @@ import unittest, string, sys, struct from test import support from collections import UserList +import random class Sequence: def __init__(self, seq='wxyz'): self.seq = seq @@ -317,6 +318,45 @@ def test_rindex(self): else: self.checkraises(TypeError, 'hello', 'rindex', 42) + def test_find_periodic_pattern(self): + """Cover the special path for periodic patterns.""" + def reference_find(p, s): + m = len(p) + for i in range(len(s)): + if s[i:i+m] == p: + return i + return -1 + + rr = random.randrange + choices = random.choices + for _ in range(1000): + p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20) + p = p0[:len(p0) - rr(10)] # pop off some characters + left = ''.join(choices('abcdef', k=rr(200))) + right = ''.join(choices('abcdef', k=rr(200))) + text = left + p + right + with self.subTest(p=p, text=text): + self.checkequal(reference_find(p, text), + text, 'find', p) + + def test_find_shift_table_overflow(self): + """When the table of 16-bit shifts overflows.""" + N = 2**16 + 100 # Overflow the 16-bit shift table + + # first check the periodic case + # here, the shift for 'b' is N. 
+ pattern1 = 'a' * N + 'b' + 'a' * N + text1 = 'babbaa' * N + pattern1 + self.checkequal(len(text1)-len(pattern1), + text1, 'find', pattern1) + + # now check the non-periodic case + # here, the shift for 'd' is 3*(N+1) + pattern2 = 'ddd' + 'abc' * N + "eee" + text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N + self.checkequal(len(text2) - N*len("de") - len(pattern2), + text2, 'find', pattern2) + def test_lower(self): self.checkequal('hello', 'HeLLo', 'lower') self.checkequal('hello', 'hello', 'lower') From cf4e398e941fe810eb0bea00d4ad9c87fb2f98bc Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 19:40:42 -0400 Subject: [PATCH 16/19] Fix typo --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index e342552b23fba0..35e60d81754f41 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -24,7 +24,7 @@ /* note: fastsearch may access s[n], which isn't a problem when using Python's ordinary string types, but may cause problems if you're using this code in other contexts. also, the count mode returns -1 - if there cannot possible be a match in the target string, and 0 if + if there cannot possibly be a match in the target string, and 0 if it has actually checked for matches, but didn't find any. callers beware! 
*/ From 39339921ee7a52001f5fd7636637c81052b639da Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 19 Oct 2020 01:23:46 -0400 Subject: [PATCH 17/19] add a cutoff for haystack length --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 35e60d81754f41..95b2e546153ff4 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -577,8 +577,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m >= 10) { - /* long needles get the two-way algorithm. */ + if (n >= 500 && m >= 10) { + /* long needles/haystacks get the two-way algorithm. */ if (mode == FAST_SEARCH) { return STRINGLIB(_fastsearch)(p, m, s, n); } From c8e54c6eb86bf80e163543040f72eefd59e54bd2 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 19 Oct 2020 01:25:47 -0400 Subject: [PATCH 18/19] simplify a couple of lines --- Objects/stringlib/fastsearch.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 95b2e546153ff4..97722c2affb412 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -451,14 +451,9 @@ Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_fastsearch)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len) { - if (needle_len == 0) { - return -1; - } - STRINGLIB_CHAR first = needle[0]; - Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len - needle_len + 1, - first); + needle[0]); if (index == -1) { return -1; } @@ -498,10 +493,9 @@ STRINGLIB(_fastcount)(const STRINGLIB_CHAR *needle, Py_ssize_t needle_len, const STRINGLIB_CHAR *haystack, Py_ssize_t haystack_len, Py_ssize_t maxcount) { - STRINGLIB_CHAR first = needle[0]; Py_ssize_t index = STRINGLIB(find_char)(haystack, haystack_len - needle_len + 1, - first); + needle[0]); if (index 
== -1) { return 0; } From 820885368bca9ec01b5536bbd9fc789ba1e9bf3e Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 19 Oct 2020 05:19:28 -0400 Subject: [PATCH 19/19] Add better thresholds --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 97722c2affb412..8991e3e9086323 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -571,7 +571,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (n >= 500 && m >= 10) { + if (n >= 4000 && m >= 20) { /* long needles/haystacks get the two-way algorithm. */ if (mode == FAST_SEARCH) { return STRINGLIB(_fastsearch)(p, m, s, n);