|
| 1 | +""" |
| 2 | +Filename: string_matching.py |
| 3 | +""" |
| 4 | + |
| 5 | +def string_matching_naive(text='', pattern=''): |
| 6 | + """ |
| 7 | + Returns positions where pattern is found in text |
| 8 | +
|
| 9 | + We slide the string to match 'pattern' over the text |
| 10 | +
|
| 11 | + O((n-m)m) |
| 12 | + Example: text = 'ababbababa', pattern = 'aba' |
| 13 | + string_matching_naive(t, s) returns [0, 5, 7] |
| 14 | + @param text text to search inside |
| 15 | + @param pattern string to search for |
| 16 | + @return list containing offsets (shifts) where pattern is found inside text |
| 17 | + """ |
| 18 | + |
| 19 | + n = len(text) |
| 20 | + m = len(pattern) |
| 21 | + offsets = [] |
| 22 | + for i in range(n-m+1): |
| 23 | + if pattern == text[i:i+m]: |
| 24 | + offsets.append(i) |
| 25 | + |
| 26 | + return offsets |
| 27 | + |
| 28 | + |
| 29 | +def string_matching_rabin_karp(text='', pattern='', hash_base=256): |
| 30 | + """ |
| 31 | + Returns positions where pattern is found in text |
| 32 | +
|
| 33 | + We calculate the hash value of the pattern and we compare it to the hash |
| 34 | + value of text[i:i+m] for i = 0..n-m |
| 35 | + The nice thing is that we don't need to calculate the hash value of |
| 36 | + text[i:i+m] each time from scratch, we know that: |
| 37 | + h(text[i+1:i+m+1]) = (base * (h(text[i:i+m]) - (text[i] * (base ^ (m-1))))) + text[i+m] |
| 38 | + We can get h('bcd') from h('abc'). |
| 39 | + h('bcd') = (base * (h('abc') - ('a' * (base ^ 2)))) + 'd' |
| 40 | + |
| 41 | + worst case: O(nm) |
| 42 | + we can expect O(n+m) if the number of valid matches is small and the pattern |
| 43 | + large |
| 44 | + |
| 45 | + Performance: ord() is slow so we shouldn't use it here |
| 46 | +
|
| 47 | + Example: text = 'ababbababa', pattern = 'aba' |
| 48 | + string_matching_rabin_karp(text, pattern) returns [0, 5, 7] |
| 49 | + @param text text to search inside |
| 50 | + @param pattern string to search for |
| 51 | + @param hash_base base to calculate the hash value |
| 52 | + @return list containing offsets (shifts) where pattern is found inside text |
| 53 | + """ |
| 54 | + |
| 55 | + n = len(text) |
| 56 | + m = len(pattern) |
| 57 | + offsets = [] |
| 58 | + htext = hash_value(text[:m], hash_base) |
| 59 | + hpattern = hash_value(pattern, hash_base) |
| 60 | + for i in range(n-m+1): |
| 61 | + if htext == hpattern: |
| 62 | + if text[i:i+m] == pattern: |
| 63 | + offsets.append(i) |
| 64 | + if i < n-m: |
| 65 | + htext = (hash_base * (htext - (ord(text[i]) * (hash_base ** (m-1))))) + ord(text[i+m]) |
| 66 | + |
| 67 | + return offsets |
| 68 | + |
| 69 | +def hash_value(s, base): |
| 70 | + """ |
| 71 | + Calculate the hash value of a string using base |
| 72 | +
|
| 73 | + Example: 'abc' = 97 x base^2 + 98 x base^1 + 99 x base^0 |
| 74 | + @param s string to compute hash value for |
| 75 | + @param base base to use to compute hash value |
| 76 | + @return hash value |
| 77 | + """ |
| 78 | + v = 0 |
| 79 | + p = len(s)-1 |
| 80 | + for i in range(p+1): |
| 81 | + v += ord(s[i]) * (base ** p) |
| 82 | + p -= 1 |
| 83 | + |
| 84 | + return v |
| 85 | + |
| 86 | +def string_matching_knuth_morris_pratt(text='', pattern=''): |
| 87 | + """ |
| 88 | + Returns positions where pattern is found in text |
| 89 | +
|
| 90 | + See http://jboxer.com/2009/12/the-knuth-morris-pratt-algorithm-in-my-own-words/ for a great explanation on how this algorithm works. |
| 91 | + |
| 92 | + O(m+n) |
| 93 | + Example: text = 'ababbababa', pattern = 'aba' |
| 94 | + string_matching_knuth_morris_pratt(text, pattern) returns [0, 5, 7] |
| 95 | + @param text text to search inside |
| 96 | + @param pattern string to search for |
| 97 | + @return list containing offsets (shifts) where pattern is found inside text |
| 98 | + """ |
| 99 | + |
| 100 | + n = len(text) |
| 101 | + m = len(pattern) |
| 102 | + offsets = [] |
| 103 | + pi = compute_prefix_function(pattern) |
| 104 | + q = 0 |
| 105 | + for i in range(n): |
| 106 | + while q > 0 and pattern[q] != text[i]: |
| 107 | + q = pi[q - 1] |
| 108 | + if pattern[q] == text[i]: |
| 109 | + q = q + 1 |
| 110 | + if q == m: |
| 111 | + offsets.append(i - m + 1) |
| 112 | + q = pi[q-1] |
| 113 | + |
| 114 | + return offsets |
| 115 | + |
| 116 | +def compute_prefix_function(p): |
| 117 | + m = len(p) |
| 118 | + pi = [0] * m |
| 119 | + k = 0 |
| 120 | + for q in range(1, m): |
| 121 | + while k > 0 and p[k] != p[q]: |
| 122 | + k = pi[k - 1] |
| 123 | + if p[k] == p[q]: |
| 124 | + k = k + 1 |
| 125 | + pi[q] = k |
| 126 | + return pi |
| 127 | + |
| 128 | +def string_matching_boyer_moore_horspool(text='', pattern=''): |
| 129 | + """ |
| 130 | + Returns positions where pattern is found in text |
| 131 | +
|
| 132 | + See http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm for an explanation on how |
| 133 | + this algorithm works. |
| 134 | + |
| 135 | + O(n) |
| 136 | + Performance: ord() is slow so we shouldn't use it here |
| 137 | +
|
| 138 | + Example: text = 'ababbababa', pattern = 'aba' |
| 139 | + string_matching_boyer_moore_horspool(text, pattern) returns [0, 5, 7] |
| 140 | + @param text text to search inside |
| 141 | + @param pattern string to search for |
| 142 | + @return list containing offsets (shifts) where pattern is found inside text |
| 143 | + """ |
| 144 | + |
| 145 | + m = len(pattern) |
| 146 | + n = len(text) |
| 147 | + offsets = [] |
| 148 | + if m > n: |
| 149 | + return offsets |
| 150 | + skip = [] |
| 151 | + for k in range(256): |
| 152 | + skip.append(m) |
| 153 | + for k in range(m-1): |
| 154 | + skip[ord(pattern[k])] = m - k - 1 |
| 155 | + skip = tuple(skip) |
| 156 | + k = m - 1 |
| 157 | + while k < n: |
| 158 | + j = m - 1; i = k |
| 159 | + while j >= 0 and text[i] == pattern[j]: |
| 160 | + j -= 1 |
| 161 | + i -= 1 |
| 162 | + if j == -1: |
| 163 | + offsets.append(i + 1) |
| 164 | + k += skip[ord(text[k])] |
| 165 | + |
| 166 | + return offsets |
| 167 | + |
0 commit comments