From 9da86426980af3f11ca36d6b28624f278fd38f42 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 17 Oct 2020 18:03:13 -0400 Subject: [PATCH 01/19] fix typo --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 56a4467d353813..d6d4a253117596 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -9,7 +9,7 @@ /* note: fastsearch may access s[n], which isn't a problem when using Python's ordinary string types, but may cause problems if you're using this code in other contexts. also, the count mode returns -1 - if there cannot possible be a match in the target string, and 0 if + if there cannot possibly be a match in the target string, and 0 if it has actually checked for matches, but didn't find any. callers beware! */ From da70219666e427cbf0ceb33554a9536896d16735 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 21 Oct 2020 05:07:46 -0400 Subject: [PATCH 02/19] initial implementation --- Objects/stringlib/fastsearch.h | 212 +++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index d6d4a253117596..168ffa4af9a5c4 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -160,6 +160,215 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) #undef MEMCHR_CUT_OFF +/* Change to a 1 to see logging comments walk through the algorithm. */ +#if 0 && STRINGLIB_SIZEOF_CHAR == 1 +# define LOG(...) printf(__VA_ARGS__) +# define LOG_STRING(s, n) printf("\"%.*s\"", n, s) +#else +# define LOG(...) +# define LOG_STRING(s, n) +#endif + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + Py_ssize_t *return_period, int invert_alphabet) +{ + /* Do a lexicographic search. 
Essentially this: + >>> max(needle[i:] for i in range(len(needle)+1)) + Also find the period of the right half. + */ + Py_ssize_t max_suffix = 0; + Py_ssize_t candidate = 1; + Py_ssize_t k = 0; + // the minimal local period around max_suffix + Py_ssize_t period = 1; + + while (candidate + k < len_needle) { + STRINGLIB_CHAR a = needle[candidate + k]; + STRINGLIB_CHAR b = needle[max_suffix + k]; + if (invert_alphabet ? (b < a) : (a < b)) { + // Fell short of max_suffix. + + // The next k + 1 characters are non-increasing + // from candidate, so they won't start a maximal suffix. + candidate += k + 1; + k = 0; + + // We've ruled out any period smaller than what's + // been scanned since max_suffix. + period = candidate - max_suffix; + } + else if (a == b) { + if (k + 1 != period) { + // Keep scanning + k++; + } + else { + // Matched a whole period. + // Start matching the next period. + candidate += period; + k = 0; + } + } + else { + // Did better than max_suffix, so replace it. + max_suffix = candidate; + candidate++; + k = 0; + period = 1; + } + } + *return_period = period; + return max_suffix; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t *return_period) +{ + /* Do a "critical factorization", making it so that: + >>> needle = (left := needle[:cut]) + (right := needle[cut:]) + where the "local period" of the cut is maximal. + + The local period of the cut is the minimal length of a string w + such that (left endswith w or w endswith left) + and (right startswith w or w startswith right). + + The Critical Factorization Theorem says that this maximal local + period is the global period of the string. + + Crochemore and Perrin (1991) show that this cut can be computed + as the later of two cuts: one that gives a lexicographically + maximal right half, and one that gives the same + with respect to a reversed alphabet-ordering. 
+ + This is what we want to happen: + >>> x = "GCAGAGAG" + >>> cut, period = factorize(x) + >>> x[:cut], (right := x[cut:]) + ('GC', 'AGAGAG') + >>> period + 2 + >>> right[period:] == right[:-period] + True + + This is how the local period lines up in the above example: + GC | AGAGAG + AGAGAGC = AGAGAGC + The length of this minimal repetition is 7, which is indeed the + period of the original string. */ + + Py_ssize_t cut1, period1, cut2, period2, cut, period; + cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0); + cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1); + + // Take the later cut. + if (cut1 > cut2) { + period = period1; + cut = cut1; + } + else { + period = period2; + cut = cut2; + } + + LOG("split: "); LOG_STRING(needle, cut); + LOG(" + "); LOG_STRING(needle + cut, len_needle - cut); + LOG("\n"); + + *return_period = period; + return cut; +} + + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, Py_ssize_t len_needle) +{ + // Crochemore and Perrin's (1991) Two-Way algorithm. + // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + LOG("===== Checking \"%s\" in \"%s\". 
=====\n", needle, haystack); + + Py_ssize_t cut, period; + cut = STRINGLIB(_factorize)(needle, len_needle, &period); + + if (memcmp(needle, needle + period, cut * STRINGLIB_SIZEOF_CHAR) == 0) { + LOG("Needle is periodic.\n"); + Py_ssize_t j = 0; + Py_ssize_t memory = 0; + while (j <= len_haystack - len_needle) { + Py_ssize_t i = Py_MAX(cut, memory); + + // Visualize the line-up: + LOG("> "); LOG_STRING(haystack, len_haystack); + LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", j + i, ""); LOG(" ^ <-- start\n"); + + while (i < len_needle && needle[i] == haystack[j + i]) { + i++; + } + if (i >= len_needle) { + LOG("Right half matches.\n"); + i = cut - 1; + while (i >= memory && needle[i] == haystack[j + i]) { + i--; + } + if (i < memory) { + LOG("Left half matches. Returning %d.\n", j); + return j; + } + LOG("Left half does not match. Jump ahead by period %d.\n", period); + j += period; + memory = len_needle - period; + } + else { + LOG("Right half does not match. Jump ahead by %d.\n", i - cut + 1); + j += i - cut + 1; + memory = 0; + } + } + } + else { + LOG("Needle is not periodic.\n"); + period = Py_MAX(cut, len_needle - cut) + 1; + Py_ssize_t j = 0; + while (j <= len_haystack - len_needle) { + + // Visualize the line-up: + LOG("> "); LOG_STRING(haystack, len_haystack); + LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", j + cut, ""); LOG(" ^ <-- start\n"); + + Py_ssize_t i = cut; + while (i < len_needle && needle[i] == haystack[j + i]) { + i++; + } + if (i >= len_needle) { + LOG("Right half matches.\n"); + i = cut - 1; + while (i >= 0 && needle[i] == haystack[j + i]) { + i--; + } + if (i < 0){ + LOG("Left half matches. Returning %d.\n", j); + return j; + } + LOG("Left half does not match. Advance by %d.\n", period); + j += period; + } + else { + LOG("Right half does not match. 
Advance by %d.\n", i - cut + 1); + j += i - cut + 1; + } + } + } + return -1; +} + +#undef LOG +#undef LOG_STRING + Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, @@ -199,6 +408,9 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { + if (mode == FAST_SEARCH) { + return STRINGLIB(_two_way)(s, n, p, m); + } const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; From 9062e6d30575cf440656a549a59792304087e3c7 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 21 Oct 2020 07:13:46 -0400 Subject: [PATCH 03/19] Add count and find functions --- Objects/stringlib/fastsearch.h | 115 ++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 9 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 168ffa4af9a5c4..4ceb20e02c380d 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -248,7 +248,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, >>> cut, period = factorize(x) >>> x[:cut], (right := x[cut:]) ('GC', 'AGAGAG') - >>> period + >>> period # right half period 2 >>> right[period:] == right[:-period] True @@ -281,19 +281,40 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, return cut; } +typedef struct STRINGLIB(_pre) { + const STRINGLIB_CHAR *needle; + Py_ssize_t len_needle; + Py_ssize_t cut; + Py_ssize_t period; + int is_periodic; +} STRINGLIB(prework); + + +Py_LOCAL_INLINE(void) +STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + STRINGLIB(prework) *p) +{ + p->needle = needle; + p->len_needle = len_needle; + p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period)); + p->is_periodic = (0 == memcmp(needle, + needle + p->period, + p->cut * STRINGLIB_SIZEOF_CHAR)); +} Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, Py_ssize_t 
len_needle) + STRINGLIB(prework) *p) { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + const STRINGLIB_CHAR *needle = p->needle; + Py_ssize_t len_needle = p->len_needle; + Py_ssize_t cut = p->cut; + Py_ssize_t period = p->period; LOG("===== Checking \"%s\" in \"%s\". =====\n", needle, haystack); - Py_ssize_t cut, period; - cut = STRINGLIB(_factorize)(needle, len_needle, &period); - - if (memcmp(needle, needle + period, cut * STRINGLIB_SIZEOF_CHAR) == 0) { + if (p->is_periodic) { LOG("Needle is periodic.\n"); Py_ssize_t j = 0; Py_ssize_t memory = 0; @@ -330,8 +351,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } } else { - LOG("Needle is not periodic.\n"); period = Py_MAX(cut, len_needle - cut) + 1; + LOG("Needle is not periodic.\n"); Py_ssize_t j = 0; while (j <= len_haystack - len_needle) { @@ -354,7 +375,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Left half matches. Returning %d.\n", j); return j; } - LOG("Left half does not match. Advance by %d.\n", period); + LOG("Left half does not match. Advance by period %d.\n", period); j += period; } else { @@ -363,9 +384,82 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } } } + LOG("Not found. 
Returning -1.\n"); return -1; } +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, Py_ssize_t len_needle) +{ + LOG(">>> Counting \"%s\" in \"%s\".\n", needle, haystack); + Py_ssize_t index; + index = STRINGLIB(find_char)(haystack, + len_haystack - len_needle + 1, + needle[0]); + if (index == -1) { + return -1; + } + if (0 == memcmp(haystack + index, + needle, + len_needle * STRINGLIB_SIZEOF_CHAR)) { + return index; + } + else { + index++; + } + STRINGLIB(prework) p; + STRINGLIB(_preprocess)(needle, len_needle, &p); + Py_ssize_t result; + result = STRINGLIB(_two_way)(haystack + index, len_haystack - index, &p); + if (result == -1) { + return -1; + } + return result + index; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, + Py_ssize_t maxcount) +{ + LOG(">>> Counting \"%s\" in \"%s\".\n", needle, haystack); + Py_ssize_t index; + Py_ssize_t count = 0; + index = STRINGLIB(find_char)(haystack, + len_haystack - len_needle + 1, + needle[0]); + if (index == -1) { + return -1; + } + if (0 == memcmp(haystack + index, + needle, + len_needle * STRINGLIB_SIZEOF_CHAR)) { + count++; + index += len_needle; + if (count == maxcount || index + len_needle > len_haystack) { + return count; + } + } + STRINGLIB(prework) p; + STRINGLIB(_preprocess)(needle, len_needle, &p); + while (index + len_needle <= len_haystack) { + Py_ssize_t result; + result = STRINGLIB(_two_way)(haystack + index, + len_haystack - index, &p); + if (result == -1) { + return count; + } + count++; + if (count == maxcount) { + return maxcount; + } + index += result + len_needle; + } + return count; +} + + #undef LOG #undef LOG_STRING @@ -409,7 +503,10 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (mode != FAST_RSEARCH) { if (mode == FAST_SEARCH) { - return STRINGLIB(_two_way)(s, n, p, m); + return 
STRINGLIB(_find)(s, n, p, m); + } + if (mode == FAST_COUNT) { + return STRINGLIB(_count)(s, n, p, m, maxcount); } const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; From d0d820aa97bbba08f1a12c13467bd6de21774d7d Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 22 Oct 2020 04:32:54 -0400 Subject: [PATCH 04/19] add jump table --- Objects/stringlib/fastsearch.h | 74 ++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 4ceb20e02c380d..06caa166b5abfe 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -13,6 +13,10 @@ it has actually checked for matches, but didn't find any. callers beware! */ +/* If the needle is long enough, use Crochemore and Perrin's Two-Way + algorithm, which has guaranteed O(n) runtime. Also compute a table + of shifts to sometimes achieve O(n/m) runtime in the best cases. */ + #define FAST_COUNT 0 #define FAST_SEARCH 1 #define FAST_RSEARCH 2 @@ -281,12 +285,22 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, return cut; } +#define USE_TABLE +#define SHIFT_TYPE uint16_t +#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define SHIFT_OVERFLOW (NOT_FOUND - 1U) + +#define TABLE_SIZE_BITS 7 +#define TABLE_SIZE (1U << TABLE_SIZE_BITS) +#define TABLE_MASK (TABLE_SIZE - 1U) + typedef struct STRINGLIB(_pre) { const STRINGLIB_CHAR *needle; Py_ssize_t len_needle; Py_ssize_t cut; Py_ssize_t period; int is_periodic; + SHIFT_TYPE table[TABLE_SIZE]; } STRINGLIB(prework); @@ -300,6 +314,17 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, p->is_periodic = (0 == memcmp(needle, needle + p->period, p->cut * STRINGLIB_SIZEOF_CHAR)); + // Now fill up a table + memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); + assert(p->table[0] == NOT_FOUND); + assert(p->table[TABLE_MASK] == NOT_FOUND); + for (Py_ssize_t i = 0; i < len_needle; i++) { + 
Py_ssize_t shift = len_needle - i; + if (shift > SHIFT_OVERFLOW) { + shift = SHIFT_OVERFLOW; + } + p->table[needle[i] & TABLE_MASK] = shift; + } } Py_LOCAL_INLINE(Py_ssize_t) @@ -324,7 +349,25 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // Visualize the line-up: LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", j + i, ""); LOG(" ^ <-- start\n"); + LOG("\n> "); LOG("%*s", j + i, ""); LOG(" ^ <-- cut\n"); + + if (haystack[j + i] != needle[i++]) { + // Sunday's trick: if we're going to jump, we might + // as well jump to line up the character *after* the + // current window. + STRINGLIB_CHAR first_outside = haystack[j + len_needle]; + SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", first_outside); + j += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + j += shift; + } + memory = 0; + continue; + } while (i < len_needle && needle[i] == haystack[j + i]) { i++; @@ -354,14 +397,33 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, period = Py_MAX(cut, len_needle - cut) + 1; LOG("Needle is not periodic.\n"); Py_ssize_t j = 0; + assert(cut < len_needle); + STRINGLIB_CHAR needle_cut = needle[cut]; while (j <= len_haystack - len_needle) { // Visualize the line-up: LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", j + cut, ""); LOG(" ^ <-- start\n"); + LOG("\n> "); LOG("%*s", j + cut, ""); LOG(" ^ <-- cut\n"); + + if (haystack[j + cut] != needle_cut) { + // Sunday's trick: if we're going to jump, we might + // as well jump to line up the character *after* the + // current window. 
+ STRINGLIB_CHAR first_outside = haystack[j + len_needle]; + SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", first_outside); + j += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + j += shift; + } + continue; + } - Py_ssize_t i = cut; + Py_ssize_t i = cut + 1; while (i < len_needle && needle[i] == haystack[j + i]) { i++; } @@ -459,6 +521,12 @@ STRINGLIB(_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, return count; } +#undef SHIFT_TYPE +#undef NOT_FOUND +#undef SHIFT_OVERFLOW +#undef TABLE_SIZE_BITS +#undef TABLE_SIZE +#undef TABLE_MASK #undef LOG #undef LOG_STRING From bab1833ce31576394b224ebbd05bccb88703940f Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 22 Oct 2020 04:59:49 -0400 Subject: [PATCH 05/19] Add cutoff --- Objects/stringlib/fastsearch.h | 37 +++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 06caa166b5abfe..125ce0b06073e9 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -451,10 +451,12 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, Py_ssize_t len_needle) +STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle) { - LOG(">>> Counting \"%s\" in \"%s\".\n", needle, haystack); + LOG("##### Counting \"%s\" in \"%s\".\n", needle, haystack); Py_ssize_t index; index = STRINGLIB(find_char)(haystack, len_haystack - len_needle + 1, @@ -464,7 +466,8 @@ STRINGLIB(_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } if (0 == memcmp(haystack + index, needle, - len_needle * STRINGLIB_SIZEOF_CHAR)) { + len_needle * 
STRINGLIB_SIZEOF_CHAR)) + { return index; } else { @@ -481,11 +484,13 @@ STRINGLIB(_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, - const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, - Py_ssize_t maxcount) +STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, + Py_ssize_t len_haystack, + const STRINGLIB_CHAR *needle, + Py_ssize_t len_needle, + Py_ssize_t maxcount) { - LOG(">>> Counting \"%s\" in \"%s\".\n", needle, haystack); + LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); Py_ssize_t index; Py_ssize_t count = 0; index = STRINGLIB(find_char)(haystack, @@ -496,7 +501,8 @@ STRINGLIB(_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } if (0 == memcmp(haystack + index, needle, - len_needle * STRINGLIB_SIZEOF_CHAR)) { + len_needle * STRINGLIB_SIZEOF_CHAR)) + { count++; index += len_needle; if (count == maxcount || index + len_needle > len_haystack) { @@ -570,11 +576,14 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (mode == FAST_SEARCH) { - return STRINGLIB(_find)(s, n, p, m); - } - if (mode == FAST_COUNT) { - return STRINGLIB(_count)(s, n, p, m, maxcount); + if (w >= 2000 && m >= 20) { + // For larger problems, use a worst-case-linear algorithm. 
+ if (mode == FAST_SEARCH) { + return STRINGLIB(_two_way_find)(s, n, p, m); + } + else { + return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); + } } const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; From bf26486456b93da62a67d9908e5a65ae1a77a6d4 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 22 Oct 2020 19:57:33 -0400 Subject: [PATCH 06/19] Add test cases --- Lib/test/string_tests.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 527f505c0169b3..f945afb7934abd 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -5,6 +5,7 @@ import unittest, string, sys, struct from test import support from collections import UserList +import random class Sequence: def __init__(self, seq='wxyz'): self.seq = seq @@ -317,6 +318,45 @@ def test_rindex(self): else: self.checkraises(TypeError, 'hello', 'rindex', 42) + def test_find_periodic_pattern(self): + """Cover the special path for periodic patterns.""" + def reference_find(p, s): + m = len(p) + for i in range(len(s)): + if s[i:i+m] == p: + return i + return -1 + + rr = random.randrange + choices = random.choices + for _ in range(1000): + p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20) + p = p0[:len(p0) - rr(10)] # pop off some characters + left = ''.join(choices('abcdef', k=rr(200))) + right = ''.join(choices('abcdef', k=rr(200))) + text = left + p + right + with self.subTest(p=p, text=text): + self.checkequal(reference_find(p, text), + text, 'find', p) + + def test_find_shift_table_overflow(self): + """When the table of 16-bit shifts overflows.""" + N = 2**16 + 100 # Overflow the 16-bit shift table + + # first check the periodic case + # here, the shift for 'b' is N. 
+ pattern1 = 'a' * N + 'b' + 'a' * N + text1 = 'babbaa' * N + pattern1 + self.checkequal(len(text1)-len(pattern1), + text1, 'find', pattern1) + + # now check the non-periodic case + # here, the shift for 'd' is 3*(N+1) + pattern2 = 'ddd' + 'abc' * N + "eee" + text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N + self.checkequal(len(text2) - N*len("de") - len(pattern2), + text2, 'find', pattern2) + def test_lower(self): self.checkequal('hello', 'HeLLo', 'lower') self.checkequal('hello', 'hello', 'lower') From bf15339e2af2f6dbdfbb1fdb2ea05e2ff879e03b Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 23 Oct 2020 23:17:26 +0000 Subject: [PATCH 07/19] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst b/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst new file mode 100644 index 00000000000000..609a0ff0be253e --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-10-23-23-17-23.bpo-41972.kbAwg4.rst @@ -0,0 +1 @@ +Substring search functions such as ``str1 in str2`` and ``str2.find(str1)`` now sometimes use the "Two-Way" string comparison algorithm to avoid quadratic behavior on long strings. 
\ No newline at end of file From f5afc66ee726b18e01edb9c8cb186e45365f1280 Mon Sep 17 00:00:00 2001 From: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> Date: Fri, 23 Oct 2020 21:31:41 -0400 Subject: [PATCH 08/19] Add safe downcasting to shift table initialization Co-authored-by: Tim Peters --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 125ce0b06073e9..df1e2dc90dcaa7 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -323,7 +323,8 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, if (shift > SHIFT_OVERFLOW) { shift = SHIFT_OVERFLOW; } - p->table[needle[i] & TABLE_MASK] = shift; + p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, + Py_ssize_t, SHIFT_TYPE); } } @@ -666,4 +667,3 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return -1; return count; } - From ed56aa0e384e061238bd11febb3459e795c1bf9c Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Fri, 23 Oct 2020 21:38:37 -0400 Subject: [PATCH 09/19] More precise comment --- Objects/stringlib/fastsearch.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index df1e2dc90dcaa7..2f3d898b7db9c1 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -13,9 +13,11 @@ it has actually checked for matches, but didn't find any. callers beware! */ -/* If the needle is long enough, use Crochemore and Perrin's Two-Way - algorithm, which has guaranteed O(n) runtime. Also compute a table - of shifts to sometimes achieve O(n/m) runtime in the best cases. */ +/* If the strings are long enough, use Crochemore and Perrin's Two-Way + algorithm, which has worst-case O(n) runtime and best-case O(n/k). 
+ Also compute a table of shifts to achieve O(n/k) in more cases, + and often (data dependent) deduce larger shifts than pure C&P can + deduce. */ #define FAST_COUNT 0 #define FAST_SEARCH 1 From f4697667341e63bf27824bb2fbae05e7bf9318b5 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 24 Oct 2020 04:20:46 -0400 Subject: [PATCH 10/19] code cleanups, increment a 'window' pointer rather than an index --- Objects/stringlib/fastsearch.h | 90 +++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 2f3d898b7db9c1..6c18986eac6702 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -287,7 +287,6 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, return cut; } -#define USE_TABLE #define SHIFT_TYPE uint16_t #define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) #define SHIFT_OVERFLOW (NOT_FOUND - 1U) @@ -326,7 +325,8 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, shift = SHIFT_OVERFLOW; } p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, - Py_ssize_t, SHIFT_TYPE); + Py_ssize_t, + SHIFT_TYPE); } } @@ -336,62 +336,69 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - const STRINGLIB_CHAR *needle = p->needle; Py_ssize_t len_needle = p->len_needle; Py_ssize_t cut = p->cut; Py_ssize_t period = p->period; - LOG("===== Checking \"%s\" in \"%s\". =====\n", needle, haystack); + const STRINGLIB_CHAR *needle = p->needle; + const STRINGLIB_CHAR *window = haystack; + const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; + LOG("===== Two-way: \"%s\" in \"%s\". 
=====\n", needle, haystack); if (p->is_periodic) { LOG("Needle is periodic.\n"); - Py_ssize_t j = 0; Py_ssize_t memory = 0; - while (j <= len_haystack - len_needle) { + while (window <= last_window) { Py_ssize_t i = Py_MAX(cut, memory); // Visualize the line-up: LOG("> "); LOG_STRING(haystack, len_haystack); - LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", j + i, ""); LOG(" ^ <-- cut\n"); + LOG("\n> "); LOG("%*s", window - haystack, ""); + LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", window - haystack + i, ""); + LOG(" ^ <-- cut\n"); - if (haystack[j + i] != needle[i++]) { + if (window[i] != needle[i++]) { // Sunday's trick: if we're going to jump, we might // as well jump to line up the character *after* the // current window. - STRINGLIB_CHAR first_outside = haystack[j + len_needle]; + STRINGLIB_CHAR first_outside = window[len_needle]; SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; if (shift == NOT_FOUND) { - LOG("\"%c\" not found. Skipping entirely.\n", first_outside); - j += len_needle + 1; + LOG("\"%c\" not found. Skipping entirely.\n", + first_outside); + window += len_needle + 1; } else { LOG("Shifting to line up \"%c\".\n", first_outside); - j += shift; + window += shift; } memory = 0; continue; } - while (i < len_needle && needle[i] == haystack[j + i]) { + while (i < len_needle && needle[i] == window[i]) { i++; } if (i >= len_needle) { LOG("Right half matches.\n"); i = cut - 1; - while (i >= memory && needle[i] == haystack[j + i]) { + while (i >= memory && needle[i] == window[i]) { i--; } if (i < memory) { - LOG("Left half matches. Returning %d.\n", j); - return j; + LOG("Left half matches. Returning %d.\n", + window - haystack); + return window - haystack; } - LOG("Left half does not match. Jump ahead by period %d.\n", period); - j += period; + LOG("Left half does not match. 
Jump ahead by period %d.\n", + period); + window += period; memory = len_needle - period; } else { - LOG("Right half does not match. Jump ahead by %d.\n", i - cut + 1); - j += i - cut + 1; + LOG("Right half does not match. Jump ahead by %d.\n", + i - cut + 1); + window += i - cut + 1; memory = 0; } } @@ -399,53 +406,58 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, else { period = Py_MAX(cut, len_needle - cut) + 1; LOG("Needle is not periodic.\n"); - Py_ssize_t j = 0; assert(cut < len_needle); STRINGLIB_CHAR needle_cut = needle[cut]; - while (j <= len_haystack - len_needle) { + while (window <= last_window) { // Visualize the line-up: LOG("> "); LOG_STRING(haystack, len_haystack); - LOG("\n> "); LOG("%*s", j, ""); LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", j + cut, ""); LOG(" ^ <-- cut\n"); + LOG("\n> "); LOG("%*s", window - haystack, ""); + LOG_STRING(needle, len_needle); + LOG("\n> "); LOG("%*s", window - haystack + cut, ""); + LOG(" ^ <-- cut\n"); - if (haystack[j + cut] != needle_cut) { + if (window[cut] != needle_cut) { // Sunday's trick: if we're going to jump, we might // as well jump to line up the character *after* the // current window. - STRINGLIB_CHAR first_outside = haystack[j + len_needle]; + STRINGLIB_CHAR first_outside = window[len_needle]; SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; if (shift == NOT_FOUND) { - LOG("\"%c\" not found. Skipping entirely.\n", first_outside); - j += len_needle + 1; + LOG("\"%c\" not found. 
Skipping entirely.\n", + first_outside); + window += len_needle + 1; } else { LOG("Shifting to line up \"%c\".\n", first_outside); - j += shift; + window += shift; } continue; } Py_ssize_t i = cut + 1; - while (i < len_needle && needle[i] == haystack[j + i]) { + while (i < len_needle && needle[i] == window[i]) { i++; } if (i >= len_needle) { LOG("Right half matches.\n"); i = cut - 1; - while (i >= 0 && needle[i] == haystack[j + i]) { + while (i >= 0 && needle[i] == window[i]) { i--; } if (i < 0){ - LOG("Left half matches. Returning %d.\n", j); - return j; + LOG("Left half matches. Returning %d.\n", + window - haystack); + return window - haystack; } - LOG("Left half does not match. Advance by period %d.\n", period); - j += period; + LOG("Left half does not match. Advance by period %d.\n", + period); + window += period; } else { - LOG("Right half does not match. Advance by %d.\n", i - cut + 1); - j += i - cut + 1; + LOG("Right half does not match. Advance by %d.\n", + i - cut + 1); + window += i - cut + 1; } } } @@ -459,7 +471,7 @@ STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, const STRINGLIB_CHAR *needle, Py_ssize_t len_needle) { - LOG("##### Counting \"%s\" in \"%s\".\n", needle, haystack); + LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); Py_ssize_t index; index = STRINGLIB(find_char)(haystack, len_haystack - len_needle + 1, From fe9e9d9c1f1c5f98c797d19e2214d1413701f6de Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 25 Oct 2020 14:21:24 -0400 Subject: [PATCH 11/19] Fix sequence point warning; test cases to meet threshold --- Lib/test/string_tests.py | 11 +++++------ Objects/stringlib/fastsearch.h | 4 +++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index f945afb7934abd..8b028d36d2328c 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -321,9 +321,8 @@ def test_rindex(self): def test_find_periodic_pattern(self): """Cover the special path for 
periodic patterns.""" def reference_find(p, s): - m = len(p) for i in range(len(s)): - if s[i:i+m] == p: + if s.startswith(p, i): return i return -1 @@ -332,8 +331,8 @@ def reference_find(p, s): for _ in range(1000): p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20) p = p0[:len(p0) - rr(10)] # pop off some characters - left = ''.join(choices('abcdef', k=rr(200))) - right = ''.join(choices('abcdef', k=rr(200))) + left = ''.join(choices('abcdef', k=rr(2000))) + right = ''.join(choices('abcdef', k=rr(2000))) text = left + p + right with self.subTest(p=p, text=text): self.checkequal(reference_find(p, text), @@ -344,14 +343,14 @@ def test_find_shift_table_overflow(self): N = 2**16 + 100 # Overflow the 16-bit shift table # first check the periodic case - # here, the shift for 'b' is N. + # here, the shift for 'b' is N + 1. pattern1 = 'a' * N + 'b' + 'a' * N text1 = 'babbaa' * N + pattern1 self.checkequal(len(text1)-len(pattern1), text1, 'find', pattern1) # now check the non-periodic case - # here, the shift for 'd' is 3*(N+1) + # here, the shift for 'd' is 3*(N+1)+1 pattern2 = 'ddd' + 'abc' * N + "eee" text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N self.checkequal(len(text2) - N*len("de") - len(pattern2), diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 6c18986eac6702..10cabbbd6400be 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -357,7 +357,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("\n> "); LOG("%*s", window - haystack + i, ""); LOG(" ^ <-- cut\n"); - if (window[i] != needle[i++]) { + if (window[i] != needle[i]) { // Sunday's trick: if we're going to jump, we might // as well jump to line up the character *after* the // current window. 
@@ -376,6 +376,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, continue; } + i++; while (i < len_needle && needle[i] == window[i]) { i++; } @@ -681,3 +682,4 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return -1; return count; } + From 40d5217d37bb85f7eaa1cae90f54a3578aaddd82 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 2 Nov 2020 04:48:05 -0500 Subject: [PATCH 12/19] Tighter inner loop; precompute the table --- Objects/stringlib/fastsearch.h | 57 ++++++++++++---------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 10cabbbd6400be..f709bc9a11533c 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -288,12 +288,11 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } #define SHIFT_TYPE uint16_t -#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) -#define SHIFT_OVERFLOW (NOT_FOUND - 1U) +#define MAX_SHIFT ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) #define TABLE_SIZE_BITS 7 -#define TABLE_SIZE (1U << TABLE_SIZE_BITS) -#define TABLE_MASK (TABLE_SIZE - 1U) +#define TABLE_SIZE (1 << TABLE_SIZE_BITS) +#define TABLE_MASK (TABLE_SIZE - 1) typedef struct STRINGLIB(_pre) { const STRINGLIB_CHAR *needle; @@ -316,17 +315,15 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, needle + p->period, p->cut * STRINGLIB_SIZEOF_CHAR)); // Now fill up a table - memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); - assert(p->table[0] == NOT_FOUND); - assert(p->table[TABLE_MASK] == NOT_FOUND); + SHIFT_TYPE default_shift = Py_SAFE_DOWNCAST(Py_MIN(len_needle, MAX_SHIFT), + Py_ssize_t, SHIFT_TYPE); + for (int i = 0; i < TABLE_SIZE; i++) { + p->table[i] = default_shift; + } for (Py_ssize_t i = 0; i < len_needle; i++) { - Py_ssize_t shift = len_needle - i; - if (shift > SHIFT_OVERFLOW) { - shift = SHIFT_OVERFLOW; - } - p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, - 
Py_ssize_t, - SHIFT_TYPE); + Py_ssize_t shift = Py_MIN(MAX_SHIFT, len_needle - i); + int index = needle[i] & TABLE_MASK; + p->table[index] = Py_SAFE_DOWNCAST(shift, Py_ssize_t, SHIFT_TYPE); } } @@ -342,6 +339,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle = p->needle; const STRINGLIB_CHAR *window = haystack; const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; + SHIFT_TYPE *table = p->table; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); if (p->is_periodic) { @@ -362,16 +360,9 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; - if (shift == NOT_FOUND) { - LOG("\"%c\" not found. Skipping entirely.\n", - first_outside); - window += len_needle + 1; - } - else { - LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; - } + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; + LOG("Shifting to line up \"%c\".\n", first_outside); + window += shift; memory = 0; continue; } @@ -423,16 +414,9 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; - if (shift == NOT_FOUND) { - LOG("\"%c\" not found. 
Skipping entirely.\n", - first_outside); - window += len_needle + 1; - } - else { - LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; - } + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; + LOG("Shifting to line up \"%c\".\n", first_outside); + window += shift; continue; } @@ -544,8 +528,7 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, } #undef SHIFT_TYPE -#undef NOT_FOUND -#undef SHIFT_OVERFLOW +#undef MAX_SHIFT #undef TABLE_SIZE_BITS #undef TABLE_SIZE #undef TABLE_MASK @@ -588,7 +571,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } mlast = m - 1; - skip = mlast - 1; + skip = mlast; mask = 0; if (mode != FAST_RSEARCH) { From a3c453d2530da8215c13fdedc433dbce9d84d35f Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 2 Nov 2020 16:31:27 -0500 Subject: [PATCH 13/19] Revert "Tighter inner loop; precompute the table" This reverts commit 40d5217d37bb85f7eaa1cae90f54a3578aaddd82. --- Objects/stringlib/fastsearch.h | 57 ++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index f709bc9a11533c..10cabbbd6400be 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -288,11 +288,12 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } #define SHIFT_TYPE uint16_t -#define MAX_SHIFT ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define SHIFT_OVERFLOW (NOT_FOUND - 1U) #define TABLE_SIZE_BITS 7 -#define TABLE_SIZE (1 << TABLE_SIZE_BITS) -#define TABLE_MASK (TABLE_SIZE - 1) +#define TABLE_SIZE (1U << TABLE_SIZE_BITS) +#define TABLE_MASK (TABLE_SIZE - 1U) typedef struct STRINGLIB(_pre) { const STRINGLIB_CHAR *needle; @@ -315,15 +316,17 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, needle + p->period, p->cut * STRINGLIB_SIZEOF_CHAR)); // Now fill up a table - SHIFT_TYPE default_shift = 
Py_SAFE_DOWNCAST(Py_MIN(len_needle, MAX_SHIFT), - Py_ssize_t, SHIFT_TYPE); - for (int i = 0; i < TABLE_SIZE; i++) { - p->table[i] = default_shift; - } + memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); + assert(p->table[0] == NOT_FOUND); + assert(p->table[TABLE_MASK] == NOT_FOUND); for (Py_ssize_t i = 0; i < len_needle; i++) { - Py_ssize_t shift = Py_MIN(MAX_SHIFT, len_needle - i); - int index = needle[i] & TABLE_MASK; - p->table[index] = Py_SAFE_DOWNCAST(shift, Py_ssize_t, SHIFT_TYPE); + Py_ssize_t shift = len_needle - i; + if (shift > SHIFT_OVERFLOW) { + shift = SHIFT_OVERFLOW; + } + p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, + Py_ssize_t, + SHIFT_TYPE); } } @@ -339,7 +342,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle = p->needle; const STRINGLIB_CHAR *window = haystack; const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; - SHIFT_TYPE *table = p->table; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); if (p->is_periodic) { @@ -360,9 +362,16 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; - LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; + SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", + first_outside); + window += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + window += shift; + } memory = 0; continue; } @@ -414,9 +423,16 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. 
STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; - LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; + SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + if (shift == NOT_FOUND) { + LOG("\"%c\" not found. Skipping entirely.\n", + first_outside); + window += len_needle + 1; + } + else { + LOG("Shifting to line up \"%c\".\n", first_outside); + window += shift; + } continue; } @@ -528,7 +544,8 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, } #undef SHIFT_TYPE -#undef MAX_SHIFT +#undef NOT_FOUND +#undef SHIFT_OVERFLOW #undef TABLE_SIZE_BITS #undef TABLE_SIZE #undef TABLE_MASK @@ -571,7 +588,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } mlast = m - 1; - skip = mlast; + skip = mlast - 1; mask = 0; if (mode != FAST_RSEARCH) { From 5e82212784b02accf7c07eff58ae640540d3ac37 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Fri, 6 Nov 2020 19:32:34 -0500 Subject: [PATCH 14/19] m cutoff to 100, n-m cutoff to 5000 --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 10cabbbd6400be..c964b8df54556f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -592,7 +592,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (w >= 2000 && m >= 20) { + if (m >= 100 && n - m >= 5000) { // For larger problems, use a worst-case-linear algorithm. 
if (mode == FAST_SEARCH) { return STRINGLIB(_two_way_find)(s, n, p, m); From 5be10c7b4b16d9ff74c7ce5d8e987c797726919c Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 17 Jan 2021 00:37:25 -0500 Subject: [PATCH 15/19] Improve comments, always use memory in periodic case, and remove the initial character scans --- Objects/stringlib/fastsearch.h | 61 ++++++++-------------------------- 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index c964b8df54556f..d15b92cf186de3 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -181,32 +181,31 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, { /* Do a lexicographic search. Essentially this: >>> max(needle[i:] for i in range(len(needle)+1)) - Also find the period of the right half. - */ + Also find the period of the right half. */ Py_ssize_t max_suffix = 0; Py_ssize_t candidate = 1; Py_ssize_t k = 0; - // the minimal local period around max_suffix + // The period of the right half. Py_ssize_t period = 1; while (candidate + k < len_needle) { + // loop increases candidate + k by 1 at each step STRINGLIB_CHAR a = needle[candidate + k]; STRINGLIB_CHAR b = needle[max_suffix + k]; + // check if the suffix at candidate is better than max_suffix if (invert_alphabet ? (b < a) : (a < b)) { // Fell short of max_suffix. - // The next k + 1 characters are non-increasing // from candidate, so they won't start a maximal suffix. candidate += k + 1; k = 0; - // We've ruled out any period smaller than what's // been scanned since max_suffix. 
period = candidate - max_suffix; } else if (a == b) { if (k + 1 != period) { - // Keep scanning + // Keep scanning the equal strings k++; } else { @@ -312,9 +311,12 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, p->needle = needle; p->len_needle = len_needle; p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period)); + assert(p->period + p->cut <= len_needle); p->is_periodic = (0 == memcmp(needle, needle + p->period, p->cut * STRINGLIB_SIZEOF_CHAR)); + assert(!p->is_periodic || (p->cut <= len_needle/2 + && p->cut < p->period)); // Now fill up a table memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); assert(p->table[0] == NOT_FOUND); @@ -370,7 +372,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } else { LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; + Py_ssize_t memory_shift = i - cut + 1; + window += Py_MAX(shift, memory_shift); } memory = 0; continue; @@ -473,30 +476,9 @@ STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_needle) { LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack); - Py_ssize_t index; - index = STRINGLIB(find_char)(haystack, - len_haystack - len_needle + 1, - needle[0]); - if (index == -1) { - return -1; - } - if (0 == memcmp(haystack + index, - needle, - len_needle * STRINGLIB_SIZEOF_CHAR)) - { - return index; - } - else { - index++; - } STRINGLIB(prework) p; STRINGLIB(_preprocess)(needle, len_needle, &p); - Py_ssize_t result; - result = STRINGLIB(_two_way)(haystack + index, len_haystack - index, &p); - if (result == -1) { - return -1; - } - return result + index; + return STRINGLIB(_two_way)(haystack, len_haystack, &p); } Py_LOCAL_INLINE(Py_ssize_t) @@ -507,27 +489,10 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t maxcount) { LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack); - Py_ssize_t index; - Py_ssize_t count = 0; - index = STRINGLIB(find_char)(haystack, - 
len_haystack - len_needle + 1, - needle[0]); - if (index == -1) { - return -1; - } - if (0 == memcmp(haystack + index, - needle, - len_needle * STRINGLIB_SIZEOF_CHAR)) - { - count++; - index += len_needle; - if (count == maxcount || index + len_needle > len_haystack) { - return count; - } - } STRINGLIB(prework) p; STRINGLIB(_preprocess)(needle, len_needle, &p); - while (index + len_needle <= len_haystack) { + Py_ssize_t index = 0, count = 0; + while (1) { Py_ssize_t result; result = STRINGLIB(_two_way)(haystack + index, len_haystack - index, &p); From 180125318757db709c86cdba5aee982495f155de Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 17 Jan 2021 14:32:46 -0500 Subject: [PATCH 16/19] Make the algorithm adaptive --- Objects/stringlib/fastsearch.h | 128 +++++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 20 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index d15b92cf186de3..50f732e74319c8 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -557,8 +557,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m >= 100 && n - m >= 5000) { - // For larger problems, use a worst-case-linear algorithm. + if (m >= 100 && w >= 2000 && w / m >= 5) { + /* For larger problems where the needle isn't a huge + percentage of the size of the haystack, the relatively + expensive O(m) startup cost of the two-way algorithm + will surely pay off. 
*/ if (mode == FAST_SEARCH) { return STRINGLIB(_two_way_find)(s, n, p, m); } @@ -574,41 +577,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[:-1] */ for (i = 0; i < mlast; i++) { STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[mlast]) + if (p[i] == p[mlast]) { skip = mlast - i - 1; + } } /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); + if (m >= 100 && w >= 8000) { + /* To ensure that we have good worst-case behavior, + here's an adaptive version of the algorithm, where if + we match O(m) characters without any matches of the + entire needle, then we predict that the startup cost of + the two-way algorithm will probably be worth it. */ + Py_ssize_t hits = 0; + for (i = 0; i <= w; i++) { + if (ss[i] == pp[0]) { + /* candidate match */ + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { + break; + } + } + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + else { + i = i + skip; + } + hits += j + 1; + if (hits >= m / 4 && i < w - 1000) { + /* We've done O(m) fruitless comparisons + anyway, so spend the O(m) cost on the + setup for the two-way algorithm. */ + Py_ssize_t res; + if (mode == FAST_COUNT) { + res = STRINGLIB(_two_way_count)( + s+i, n-i, p, m, maxcount-count); + return count + res; + } + else { + res = STRINGLIB(_two_way_find)(s+i, n-i, p, m); + if (res == -1) { + return -1; + } + return i + res; + } + } + } + else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + } + } + if (mode != FAST_COUNT) { + return -1; + } + return count; + } + /* The standard, non-adaptive version of the algorithm. 
*/ for (i = 0; i <= w; i++) { /* note: using mlast in the skip path slows things down on x86 */ if (ss[i] == pp[0]) { /* candidate match */ - for (j = 0; j < mlast; j++) - if (s[i+j] != p[j]) + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { break; + } + } if (j == mlast) { /* got a match! */ - if (mode != FAST_COUNT) + if (mode != FAST_COUNT) { return i; + } count++; - if (count == maxcount) + if (count == maxcount) { return maxcount; + } i = i + mlast; continue; } /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; - else + } + else { i = i + skip; - } else { + } + } + else { /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; + } } } - } else { /* FAST_RSEARCH */ + } + else { /* FAST_RSEARCH */ /* create compressed boyer-moore delta 1 table */ @@ -617,28 +697,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[:0:-1] */ for (i = mlast; i > 0; i--) { STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) + if (p[i] == p[0]) { skip = i - 1; + } } for (i = w; i >= 0; i--) { if (s[i] == p[0]) { /* candidate match */ - for (j = mlast; j > 0; j--) - if (s[i+j] != p[j]) + for (j = mlast; j > 0; j--) { + if (s[i+j] != p[j]) { break; - if (j == 0) + } + } + if (j == 0) { /* got a match! 
*/ return i; + } /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { i = i - m; - else + } + else { i = i - skip; - } else { + } + } + else { /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { i = i - m; + } } } } From 8fb897949e3c5f7e35c1e9d1ca65e04ddb0831c2 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 17 Jan 2021 17:29:07 -0500 Subject: [PATCH 17/19] Add notes text document --- .../stringlib_find_two_way_notes.txt | 431 ++++++++++++++++++ 1 file changed, 431 insertions(+) create mode 100644 Objects/stringlib/stringlib_find_two_way_notes.txt diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt new file mode 100644 index 00000000000000..c944d16ca9c8fd --- /dev/null +++ b/Objects/stringlib/stringlib_find_two_way_notes.txt @@ -0,0 +1,431 @@ +This document explains Crochemore and Perrin's Two-Way string matching +algorithm, in which a smaller string (the "pattern" or "needle") +is searched for in a longer string (the "text" or "haystack"), +determining whether the needle is a substring of the haystack, and if +so, at what index(es). It is to be used by Python's string +(and bytes-like) objects when calling `find`, `index`, `__contains__`, +or implicitly in methods like `replace` or `partition`. + +This is essentially a re-telling of the paper + + Crochemore M., Perrin D., 1991, Two-way string-matching, + Journal of the ACM 38(3):651-675. + +focused more on understanding and examples than on rigor. See also +the code sample here: + + http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + +The algorithm runs in O(len(needle) + len(haystack)) time and with +O(1) space. 
However, since there is a larger preprocessing cost than +simpler algorithms, this Two-Way algorithm is to be used only when the +needle and haystack lengths meet certain thresholds. + + +These are the basic steps of the algorithm: + + * "Very carefully" cut the needle in two. + * For each alignment attempted: + 1. match the right part + * On failure, jump by the amount matched + 1 + 2. then match the left part. + * On failure jump by max(len(left), len(right)) + 1 + * If the needle is periodic, don't re-do comparisons; maintain + a "memory" of how many characters you already know match. + + +-------- Matching the right part -------- + +We first scan the right part of the needle to check if it matches the +aligned characters in the haystack. We scan left-to-right, +and if a mismatch occurs, we jump ahead by the amount matched plus 1. + +Example: + + text: ........EFGX................... + pattern: ....abcdEFGH.... + cut: <<<<>>>> + +Matched 3, so jump ahead by 4: + + text: ........EFGX................... + pattern: ....abcdEFGH.... + cut: <<<<>>>> + +Why are we allowed to do this? Because we cut the needle very +carefully, in such a way that if the cut is ...abcd + EFGH... then +we have + + d != E + cd != EF + bcd != EFG + abcd != EFGH + ... and so on. + +If this is true for every pair of equal-length substrings around the +cut, then the following alignments do not work, so we can skip them: + + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^ (Bad because d != E) + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^^ (Bad because cd != EF) + text: ........EFG.................... + pattern: ....abcdEFGH.... + ^^^ (Bad because bcd != EFG) + +Skip 3 alignments => increment alignment by 4. + + +-------- If len(left_part) < len(right_part) -------- + +Above is the core idea, and it begins to suggest how the algorithm can +be linear-time.
There is one bit of subtlety involving what to do +around the end of the needle: if the left half is shorter than the +right, then we could run into something like this: + + text: .....EFG...... + pattern: cdEFGH + +The same argument holds that we can skip ahead by 4, so long as + + d != E + cd != EF + ?cd != EFG + ??cd != EFGH + etc. + +The question marks represent "wildcards" that always match; they're +outside the limits of the needle, so there's no way for them to +invalidate a match. To ensure that the inequalities above are always +true, we need them to be true for all possible '?' values. We thus +need cd != FG and cd != GH, etc. + + +-------- Matching the left part -------- + +Once we have ensured the right part matches, we scan the left part +(order doesn't matter, but traditionally right-to-left), and if we +find a mismatch, we jump ahead by +max(len(left_part), len(right_part)) + 1. That we can jump by +at least len(right_part) + 1 we have already seen: + + text: .....EFG..... + pattern: abcdEFG + Matched 3, so jump by 4, + using the fact that d != E, cd != EF, and bcd != EFG. + +But we can also jump by at least len(left_part) + 1: + + text: ....cdEF..... + pattern: abcdEF + Jump by len('abcd') + 1 = 5. + + Skip the alignments: + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + text: ....cdEF..... + pattern: abcdEF + +This requires the following facts: + d != E + cd != EF + bcd != EF? + abcd != EF?? + etc., for all values of ?s, as above. + +If we have both sets of inequalities, then we can indeed jump by +max(len(left_part), len(right_part)) + 1. Under the assumption of such +a nice splitting of the needle, we now have enough to prove linear +time for the search: consider the forward-progress/comparisons ratio +at each alignment position. If a mismatch occurs in the right part, +the ratio is 1 position forward per comparison. 
On the other hand, +if a mismatch occurs in the left half, we advance by more than +len(needle)//2 positions for at most len(needle) comparisons, +so this ratio is more than 1/2. This average "movement speed" is +bounded below by the constant "1 position per 2 comparisons", so we +have linear time. + + +-------- The periodic case -------- + +The sets of inequalities listed so far seem too good to be true in +the general case. Indeed, they fail when a needle is periodic: +there's no way to split 'AAbAAbAAbA' in two such that + + (the stuff n characters to the left of the split) + cannot equal + (the stuff n characters to the right of the split) + for all n. + +This is because no matter how you cut it, you'll get +s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the +needle in two so that n can be as big as possible. If we were to +split it as + + AAbA + AbAAbA + +then A == A at the split, so this is bad (we failed at length 1), but +if we split it as + + AA + bAAbAAbA + +we at least have A != b and AA != bA, and we fail at length 3 +since ?AA == bAA. We already knew that a cut to make length-3 +mismatch was impossible due to the period, but we now see that the +bound is sharp; we can get length-1 and length-2 to mismatch. + +This is exactly the content of the *critical factorization theorem*: +that no matter the period of the original needle, you can cut it in +such a way that (with the appropriate question marks), +needle[cut-k:cut] mismatches needle[cut:cut+k] for all k < the period. + +Even "non-periodic" strings are periodic with a period equal to +their length, so for such needles, the CFT already guarantees that +the algorithm described so far will work, since we can cut the needle +so that the length-k chunks on either side of the cut mismatch for all +k < len(needle). Looking closer at the algorithm, we only actually +require that k go up to max(len(left_part), len(right_part)). +So long as the period exceeds that, we're good. 
+ +The more general shorter-period case is a bit harder. The essentials +are the same, except we use the periodicity to our advantage by +"remembering" periods that we've already compared. In our running +example, say we're computing + + "AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA". + +We cut as AA + bAAbAAbA, and then the algorithm runs as follows: + + First alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^X + - Mismatch at third position, so jump by 3. + - This requires that A!=b and AA != bA. + + Second alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^^^^^ + X + - Matched entire right part + - Mismatch at left part. + - Jump forward a period, remembering the existing comparisons + + Third alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + mmmmmmm^^X + - There's "memory": a bunch of characters were already matched. + - Two more characters match beyond that. + - The 8th character of the right part mismatched, so jump by 8 + - The above rule is more complicated than usual: we don't have + the right inequalities for lengths 1 through 7, but we do have + shifted copies of the length-1 and length-2 inequalities, + along with knowledge of the mismatch. We can skip all of these + alignments at once: + + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~~ AA != bA at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^X 7-3=4 match, and the 5th misses. + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~~ AA != bA at the cut + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^X 7-3-3=1 match and the 2nd misses. + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ~ A != b at the cut + + Fourth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^X + - Second character mismatches, so jump by 2. 
+ + Fifth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + ^^^^^^^^ + X + - Right half matches, so use memory and skip ahead by period=3 + + Sixth alignment: + bbbAbbAAbAAbAAbbbAAbAAbAAbAA + AAbAAbAAbA + mmmmmmmm^^ + - Right part matches, left part is remembered, found a match! + +The one tricky skip by 8 here generalizes: if we have a period of p, +then the CFT says we can ensure the cut has the inequality property +for lengths 1 through p-1, and jumping by p would line up the +matching characters and mismatched character one period earlier. +Inductively, this proves that we can skip by the number of characters +matched in the right half, plus 1, just as in the original algorithm. + +To make it explicit, the memory is set whenever the entire right part +is matched and is then used as a starting point in the next alignment. +In such a case, the alignment jumps forward one period, and the right +half matches all except possibly the last period. Additionally, +if we cut so that the left part has a length strictly less than the +period (we always can!), then we can know that the left part already +matches. The memory is reset to 0 whenever there is a mismatch in the +right part. + +To prove linearity for the periodic case, note that if a right-part +character mismatches, then we advance forward 1 unit per comparison. +On the other hand, if the entire right part matches, then the skipping +forward by one period "defers" some of the comparisons to the next +alignment, where they will then be spent at the usual rate of +one comparison per step forward. Even if left-half comparisons +are always "wasted", they constitute less than half of all +comparisons, so the average rate is certainly at least 1 move forward +per 2 comparisons. 
+ + +-------- When to choose the periodic algorithm --------- + +The periodic algorithm is always valid but has an overhead of one +more "memory" register and some memory computation steps, so the +here-described-first non-periodic/long-period algorithm -- skipping by +max(len(left_part), len(right_part)) + 1 rather than the period -- +should be preferred when possible. + +Interestingly, the long-period algorithm does not require an exact +computation of the period; it works even with some long-period, but +undeniably "periodic" needles: + + Cut: AbcdefAbc == Abcde + fAbc + +This cut gives these inequalities: + + e != f + de != fA + cde != fAb + bcde != fAbc + Abcde != fAbc? + The first failure is a period long, per the CFT: + ?Abcde == fAbc?? + +A sufficient condition for using the long-period algorithm is having +the period of the needle be greater than +max(len(left_part), len(right_part)). This way, after choosing a good +split, we get all of the max(len(left_part), len(right_part)) +inequalities around the cut that were required in the long-period +version of the algorithm. + +With all of this in mind, here's how we choose: + + (1) Choose a "critical factorization" of the needle -- a cut + where we have period minus 1 inequalities in a row. + More specifically, choose a cut so that the left_part + is less than one period long. + (2) Determine the period P_r of the right_part. + (3) Check if the left part is just an extension of the pattern of + the right part, so that the whole needle has period P_r. + Explicitly, check if + needle[0:cut] == needle[0+P_r:cut+P_r] + If so, we use the periodic algorithm. If not equal, we use the + long-period algorithm. + +Note that if equality holds in (3), then the period of the whole +string is P_r. On the other hand, suppose equality does not hold. +The period of the needle is then strictly greater than P_r. 
Here's +a general fact: + + If p is a substring of s and p has period r, then the period + of s is either equal to r or greater than len(p). + +We know that needle_period != P_r, +and therefore needle_period > len(right_part). +Additionally, we'll choose the cut (see below) +so that len(left_part) < needle_period. + +Thus, in the case where equality does not hold, we have that +needle_period >= max(len(left_part), len(right_part)) + 1, +so the long-period algorithm works, but otherwise, we know the period +of the needle. + +Note that this decision process doesn't always require an exact +computation of the period -- we can get away with only computing P_r! + + +-------- Computing the cut -------- + +Our remaining tasks are now to compute a cut of the needle with as +many inequalities as possible, ensuring that cut < needle_period. +Meanwhile, we must also compute the period P_r of the right_part. + +The computation is relatively simple, essentially doing this: + + suffix1 = max(needle[i:] for i in range(len(needle))) + suffix2 = ... # the same as above, but invert the alphabet + cut1 = len(needle) - len(suffix1) + cut2 = len(needle) - len(suffix2) + cut = max(cut1, cut2) # the later cut + +For cut2, "invert the alphabet" is different than saying min(...), +since in lexicographic order, we still put "py" < "python", even +if the alphabet is inverted. Computing these, along with the method +of computing the period of the right half, is easiest to read directly +from the source code in fastsearch.h, in which these are computed +in linear time. + +Crochemore & Perrin's Theorem 3.1 gives that "cut" above is a +critical factorization less than the period, but a very brief sketch +of their proof goes something like this (this is far from complete): + + * If this cut splits the needle as some + needle == (a + w) + (w + b), meaning there's a bad equality + w == w, it's impossible for w + b to be bigger than both + b and w + w + b, so this can't happen.
We thus have all of + the inequalities with no question marks. + * By maximality, the right part is not a substring of the left + part. Thus, we have all of the inequalities involving no + left-side question marks. + * If you have all of the inequalities without right-side question + marks, we have a critical factorization. + * If one such inequality fails, then there's a smaller period, + but the factorization is nonetheless critical. Here's where + you need the redundancy coming from computing both cuts and + choosing the later one. + + +-------- Some more Bells and Whistles -------- + +Beyond Crochemore & Perrin's original algorithm, we can use a couple +more tricks for speed in fastsearch.h: + + 1. Even though C&P has a best-case O(n/m) time, this doesn't occur + very often, so we add a Boyer-Moore bad character table to + achieve sublinear time in more cases. + + 2. The prework of computing the cut/period is expensive per + needle character, so we shouldn't do it if it won't pay off. + For this reason, if the needle and haystack are long enough, + only automatically start with two-way if the needle's length + is a small percentage of the length of the haystack. + + 3. In cases where the needle and haystack are large but the needle + makes up a significant percentage of the length of the + haystack, don't pay the expensive two-way preprocessing cost + if you don't need to. Instead, keep track of how many + character comparisons are equal, and if that exceeds + O(len(needle)), then pay that cost, since the simpler algorithm + isn't doing very well. 
From e616bee3bc27420e713cf76f4343d80328c30d3e Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 17 Jan 2021 18:13:04 -0500 Subject: [PATCH 18/19] skip starts at mlast --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 50f732e74319c8..54a88b82f4cf19 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -553,7 +553,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } mlast = m - 1; - skip = mlast - 1; + skip = mlast; mask = 0; if (mode != FAST_RSEARCH) { From b63e7dc77f32e424a36e01ed229202574257cd1d Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 27 Feb 2021 01:22:15 -0500 Subject: [PATCH 19/19] Refactor to use for-loops, single cache-line table --- Lib/test/string_tests.py | 4 +- Objects/stringlib/fastsearch.h | 107 +++++++++--------- .../stringlib_find_two_way_notes.txt | 14 +-- 3 files changed, 60 insertions(+), 65 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 8b028d36d2328c..840d7bb7550f71 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -339,8 +339,8 @@ def reference_find(p, s): text, 'find', p) def test_find_shift_table_overflow(self): - """When the table of 16-bit shifts overflows.""" - N = 2**16 + 100 # Overflow the 16-bit shift table + """When the table of 8-bit shifts overflows.""" + N = 2**8 + 100 # first check the periodic case # here, the shift for 'b' is N + 1. 
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 54a88b82f4cf19..6574720b609f4c 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -189,7 +189,7 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t period = 1; while (candidate + k < len_needle) { - // loop increases candidate + k by 1 at each step + // each loop increases candidate + k + max_suffix STRINGLIB_CHAR a = needle[candidate + k]; STRINGLIB_CHAR b = needle[max_suffix + k]; // check if the suffix at candidate is better than max_suffix @@ -286,11 +286,11 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, return cut; } -#define SHIFT_TYPE uint16_t +#define SHIFT_TYPE uint8_t #define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) #define SHIFT_OVERFLOW (NOT_FOUND - 1U) -#define TABLE_SIZE_BITS 7 +#define TABLE_SIZE_BITS 6 #define TABLE_SIZE (1U << TABLE_SIZE_BITS) #define TABLE_MASK (TABLE_SIZE - 1U) @@ -315,8 +315,14 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, p->is_periodic = (0 == memcmp(needle, needle + p->period, p->cut * STRINGLIB_SIZEOF_CHAR)); - assert(!p->is_periodic || (p->cut <= len_needle/2 - && p->cut < p->period)); + if (p->is_periodic) { + assert(p->cut <= len_needle/2); + assert(p->cut < p->period); + } + else { + // A lower bound on the period + p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; + } // Now fill up a table memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); assert(p->table[0] == NOT_FOUND); @@ -344,11 +350,13 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle = p->needle; const STRINGLIB_CHAR *window = haystack; const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; + SHIFT_TYPE *table = p->table; LOG("===== Two-way: \"%s\" in \"%s\". 
=====\n", needle, haystack); if (p->is_periodic) { LOG("Needle is periodic.\n"); Py_ssize_t memory = 0; + periodicwindowloop: while (window <= last_window) { Py_ssize_t i = Py_MAX(cut, memory); @@ -364,7 +372,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; if (shift == NOT_FOUND) { LOG("\"%c\" not found. Skipping entirely.\n", first_outside); @@ -376,42 +384,36 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, window += Py_MAX(shift, memory_shift); } memory = 0; - continue; + goto periodicwindowloop; } - - i++; - while (i < len_needle && needle[i] == window[i]) { - i++; - } - if (i >= len_needle) { - LOG("Right half matches.\n"); - i = cut - 1; - while (i >= memory && needle[i] == window[i]) { - i--; + for (i = i + 1; i < len_needle; i++) { + if (needle[i] != window[i]) { + LOG("Right half does not match. Jump ahead by %d.\n", + i - cut + 1); + window += i - cut + 1; + memory = 0; + goto periodicwindowloop; } - if (i < memory) { - LOG("Left half matches. Returning %d.\n", - window - haystack); - return window - haystack; - } - LOG("Left half does not match. Jump ahead by period %d.\n", - period); - window += period; - memory = len_needle - period; } - else { - LOG("Right half does not match. Jump ahead by %d.\n", - i - cut + 1); - window += i - cut + 1; - memory = 0; + for (i = memory; i < cut; i++) { + if (needle[i] != window[i]) { + LOG("Left half does not match. Jump ahead by period %d.\n", + period); + window += period; + memory = len_needle - period; + goto periodicwindowloop; + } } + LOG("Left half matches. 
Returning %d.\n", + window - haystack); + return window - haystack; } } else { - period = Py_MAX(cut, len_needle - cut) + 1; LOG("Needle is not periodic.\n"); assert(cut < len_needle); STRINGLIB_CHAR needle_cut = needle[cut]; + windowloop: while (window <= last_window) { // Visualize the line-up: @@ -426,7 +428,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // as well jump to line up the character *after* the // current window. STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = p->table[first_outside & TABLE_MASK]; + SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; if (shift == NOT_FOUND) { LOG("\"%c\" not found. Skipping entirely.\n", first_outside); @@ -436,33 +438,26 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Shifting to line up \"%c\".\n", first_outside); window += shift; } - continue; - } - - Py_ssize_t i = cut + 1; - while (i < len_needle && needle[i] == window[i]) { - i++; + goto windowloop; } - if (i >= len_needle) { - LOG("Right half matches.\n"); - i = cut - 1; - while (i >= 0 && needle[i] == window[i]) { - i--; + for (Py_ssize_t i = cut + 1; i < len_needle; i++) { + if (needle[i] != window[i]) { + LOG("Right half does not match. Advance by %d.\n", + i - cut + 1); + window += i - cut + 1; + goto windowloop; } - if (i < 0){ - LOG("Left half matches. Returning %d.\n", - window - haystack); - return window - haystack; - } - LOG("Left half does not match. Advance by period %d.\n", - period); - window += period; } - else { - LOG("Right half does not match. Advance by %d.\n", - i - cut + 1); - window += i - cut + 1; + for (Py_ssize_t i = 0; i < cut; i++) { + if (needle[i] != window[i]) { + LOG("Left half does not match. Advance by period %d.\n", + period); + window += period; + goto windowloop; + } } + LOG("Left half matches. Returning %d.\n", window - haystack); + return window - haystack; } } LOG("Not found. 
Returning -1.\n"); diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt index c944d16ca9c8fd..afe45431a75ac4 100644 --- a/Objects/stringlib/stringlib_find_two_way_notes.txt +++ b/Objects/stringlib/stringlib_find_two_way_notes.txt @@ -156,7 +156,7 @@ have linear time. The sets of inequalities listed so far seem too good to be true in the general case. Indeed, they fail when a needle is periodic: -there's no way to split 'AAbAAbAAbA' in two such that +there's no way to split 'AAbAAbAAbA' in two such that (the stuff n characters to the left of the split) cannot equal @@ -166,8 +166,8 @@ there's no way to split 'AAbAAbAAbA' in two such that This is because no matter how you cut it, you'll get s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the needle in two so that n can be as big as possible. If we were to -split it as - +split it as + AAbA + AbAAbA then A == A at the split, so this is bad (we failed at length 1), but @@ -194,9 +194,9 @@ require that k go up to max(len(left_part), len(right_part)). So long as the period exceeds that, we're good. The more general shorter-period case is a bit harder. The essentials -are the same, except we use the periodicity to our advantage by +are the same, except we use the periodicity to our advantage by "remembering" periods that we've already compared. In our running -example, say we're computing +example, say we're computing "AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA". @@ -233,7 +233,7 @@ We cut as AA + bAAbAAbA, and then the algorithm runs as follows: bbbAbbAAbAAbAAbbbAAbAAbAAbAA AAbAAbAAbA - ~ A != b at the cut + ~ A != b at the cut bbbAbbAAbAAbAAbbbAAbAAbAAbAA AAbAAbAAbA ~~ AA != bA at the cut @@ -421,7 +421,7 @@ more tricks for speed in fastsearch.h: For this reason, if the needle and haystack are long enough, only automatically start with two-way if the needle's length is a small percentage of the length of the haystack. - + 3. 
In cases where the needle and haystack are large but the needle makes up a significant percentage of the length of the haystack, don't pay the expensive two-way preprocessing cost