Skip to content

Commit d60e2da

Browse files
committed
initial commit
0 parents  commit d60e2da

File tree

6 files changed

+303
-0
lines changed

6 files changed

+303
-0
lines changed

MANIFEST

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
README
2+
setup.py
3+
stringmatching.py

README

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
## Python Algorithms Library
2+
## Laurent Luce
3+
4+
### Description
5+
The purpose of this library is to help you with common algorithms like:
6+
7+
String matching:
8+
- Naive
9+
- Rabin-Karp
10+
- Knuth-Morris-Pratt
11+
- Boyer-Moore-Horspool
12+
13+
### Installation
14+
Get the source and run
15+
16+
$ python setup.py install
17+
18+
### Usage
19+
To use the Python Algorithms library, just import the modules you need
20+
like 'import string_matching' in your current python application.
21+
22+
### License
23+
The Python Algorithms Library is distributed under the MIT License
+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import time
2+
import string_matching
3+
4+
class StringMatchingPerformance:
5+
6+
def __init__(self):
7+
pass
8+
9+
def calculate_performance(self):
10+
t = 'ababbababa'
11+
s = 'aba'
12+
times = 1000
13+
14+
ts = time.time()
15+
for i in range(times):
16+
string_matching.string_matching_naive(t, s)
17+
t1 = time.time() - ts
18+
print 'string_matching_naive: %.2f seconds' % t1
19+
20+
ts = time.time()
21+
for i in range(times):
22+
string_matching.string_matching_rabin_karp(t, s)
23+
t2 = time.time() - ts
24+
print 'string_matching_rabin_karp: %.2f seconds' % t2
25+
26+
ts = time.time()
27+
for i in range(times):
28+
string_matching.string_matching_knuth_morris_pratt(t, s)
29+
t2 = time.time() - ts
30+
print 'string_matching_knuth_morris_pratt: %.2f seconds' % t2
31+
32+
ts = time.time()
33+
for i in range(times):
34+
string_matching.string_matching_boyer_moore_horspool(t, s)
35+
t2 = time.time() - ts
36+
print 'string_matching_boyer_moore_horspool: %.2f seconds' % t2
37+
38+
if __name__ == '__main__':
39+
p = StringMatchingPerformance()
40+
p.calculate_performance()
41+

setup.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from distutils.core import setup
2+
setup(
3+
name = "pyalgorithms",
4+
py_modules = ['string_matching'],
5+
version = "0.1",
6+
description = "Python Algorithms",
7+
author = "Laurent Luce",
8+
author_email = "laurentluce49@yahoo.com",
9+
url = "http://github.com/laurentluce/pyalgorithms",
10+
download_url = "http://github.com/laurentluce/pyalgorithms",
11+
keywords = ["pyalgorithms","algorithms"],
12+
classifiers = [
13+
"Programming Language :: Python",
14+
"Operating System :: OS Independent",
15+
"License :: OSI Approved :: MIT License",
16+
"Intended Audience :: Developers",
17+
"Development Status :: 5 - Production/Stable",
18+
"Topic :: Software Development :: Libraries :: Python Modules"
19+
],
20+
long_description = """\
21+
Python Algorithms Library
22+
----------------------------
23+
24+
DESCRIPTION
25+
The purpose of this library is to help you with basic and more advanced
26+
algorithms
27+
28+
LICENSE The Python Algorithms Library is distributed under the MIT
29+
License """ )

string_matching.py

+167
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
"""
2+
Filename: string_matching.py
3+
"""
4+
5+
def string_matching_naive(text='', pattern=''):
6+
"""
7+
Returns positions where pattern is found in text
8+
9+
We slide the string to match 'pattern' over the text
10+
11+
O((n-m)m)
12+
Example: text = 'ababbababa', pattern = 'aba'
13+
string_matching_naive(t, s) returns [0, 5, 7]
14+
@param text text to search inside
15+
@param pattern string to search for
16+
@return list containing offsets (shifts) where pattern is found inside text
17+
"""
18+
19+
n = len(text)
20+
m = len(pattern)
21+
offsets = []
22+
for i in range(n-m+1):
23+
if pattern == text[i:i+m]:
24+
offsets.append(i)
25+
26+
return offsets
27+
28+
29+
def string_matching_rabin_karp(text='', pattern='', hash_base=256):
30+
"""
31+
Returns positions where pattern is found in text
32+
33+
We calculate the hash value of the pattern and we compare it to the hash
34+
value of text[i:i+m] for i = 0..n-m
35+
The nice thing is that we don't need to calculate the hash value of
36+
text[i:i+m] each time from scratch, we know that:
37+
h(text[i+1:i+m+1]) = (base * (h(text[i:i+m]) - (text[i] * (base ^ (m-1))))) + text[i+m]
38+
We can get h('bcd') from h('abc').
39+
h('bcd') = (base * (h('abc') - ('a' * (base ^ 2)))) + 'd'
40+
41+
worst case: O(nm)
42+
we can expect O(n+m) if the number of valid matches is small and the pattern
43+
large
44+
45+
Performance: ord() is slow so we shouldn't use it here
46+
47+
Example: text = 'ababbababa', pattern = 'aba'
48+
string_matching_rabin_karp(text, pattern) returns [0, 5, 7]
49+
@param text text to search inside
50+
@param pattern string to search for
51+
@param hash_base base to calculate the hash value
52+
@return list containing offsets (shifts) where pattern is found inside text
53+
"""
54+
55+
n = len(text)
56+
m = len(pattern)
57+
offsets = []
58+
htext = hash_value(text[:m], hash_base)
59+
hpattern = hash_value(pattern, hash_base)
60+
for i in range(n-m+1):
61+
if htext == hpattern:
62+
if text[i:i+m] == pattern:
63+
offsets.append(i)
64+
if i < n-m:
65+
htext = (hash_base * (htext - (ord(text[i]) * (hash_base ** (m-1))))) + ord(text[i+m])
66+
67+
return offsets
68+
69+
def hash_value(s, base):
70+
"""
71+
Calculate the hash value of a string using base
72+
73+
Example: 'abc' = 97 x base^2 + 98 x base^1 + 99 x base^0
74+
@param s string to compute hash value for
75+
@param base base to use to compute hash value
76+
@return hash value
77+
"""
78+
v = 0
79+
p = len(s)-1
80+
for i in range(p+1):
81+
v += ord(s[i]) * (base ** p)
82+
p -= 1
83+
84+
return v
85+
86+
def string_matching_knuth_morris_pratt(text='', pattern=''):
87+
"""
88+
Returns positions where pattern is found in text
89+
90+
See http://jboxer.com/2009/12/the-knuth-morris-pratt-algorithm-in-my-own-words/ for a great explanation on how this algorithm works.
91+
92+
O(m+n)
93+
Example: text = 'ababbababa', pattern = 'aba'
94+
string_matching_knuth_morris_pratt(text, pattern) returns [0, 5, 7]
95+
@param text text to search inside
96+
@param pattern string to search for
97+
@return list containing offsets (shifts) where pattern is found inside text
98+
"""
99+
100+
n = len(text)
101+
m = len(pattern)
102+
offsets = []
103+
pi = compute_prefix_function(pattern)
104+
q = 0
105+
for i in range(n):
106+
while q > 0 and pattern[q] != text[i]:
107+
q = pi[q - 1]
108+
if pattern[q] == text[i]:
109+
q = q + 1
110+
if q == m:
111+
offsets.append(i - m + 1)
112+
q = pi[q-1]
113+
114+
return offsets
115+
116+
def compute_prefix_function(p):
117+
m = len(p)
118+
pi = [0] * m
119+
k = 0
120+
for q in range(1, m):
121+
while k > 0 and p[k] != p[q]:
122+
k = pi[k - 1]
123+
if p[k] == p[q]:
124+
k = k + 1
125+
pi[q] = k
126+
return pi
127+
128+
def string_matching_boyer_moore_horspool(text='', pattern=''):
129+
"""
130+
Returns positions where pattern is found in text
131+
132+
See http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm for an explanation on how
133+
this algorithm works.
134+
135+
O(n)
136+
Performance: ord() is slow so we shouldn't use it here
137+
138+
Example: text = 'ababbababa', pattern = 'aba'
139+
string_matching_boyer_moore_horspool(text, pattern) returns [0, 5, 7]
140+
@param text text to search inside
141+
@param pattern string to search for
142+
@return list containing offsets (shifts) where pattern is found inside text
143+
"""
144+
145+
m = len(pattern)
146+
n = len(text)
147+
offsets = []
148+
if m > n:
149+
return offsets
150+
skip = []
151+
for k in range(256):
152+
skip.append(m)
153+
for k in range(m-1):
154+
skip[ord(pattern[k])] = m - k - 1
155+
skip = tuple(skip)
156+
k = m - 1
157+
while k < n:
158+
j = m - 1; i = k
159+
while j >= 0 and text[i] == pattern[j]:
160+
j -= 1
161+
i -= 1
162+
if j == -1:
163+
offsets.append(i + 1)
164+
k += skip[ord(text[k])]
165+
166+
return offsets
167+

tests/test_string_matching.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import unittest
2+
import string_matching
3+
4+
class StringMatchingTest(unittest.TestCase):
5+
6+
def test_string_matching_naive(self):
7+
t = 'ababbababa'
8+
s = 'aba'
9+
self.assertEquals(string_matching.string_matching_naive(t, s), [0, 5, 7])
10+
t = 'ababbababa'
11+
s = 'abbb'
12+
self.assertEquals(string_matching.string_matching_naive(t, s), [])
13+
14+
def test_string_matching_rabin_karp(self):
15+
t = 'ababbababa'
16+
s = 'aba'
17+
self.assertEquals(string_matching.string_matching_rabin_karp(t, s), [0, 5, 7])
18+
t = 'ababbababa'
19+
s = 'abbb'
20+
self.assertEquals(string_matching.string_matching_rabin_karp(t, s), [])
21+
22+
def test_string_matching_knuth_morris_pratt(self):
23+
t = 'ababbababa'
24+
s = 'aba'
25+
self.assertEquals(string_matching.string_matching_knuth_morris_pratt(t, s), [0, 5, 7])
26+
t = 'ababbababa'
27+
s = 'abbb'
28+
self.assertEquals(string_matching.string_matching_knuth_morris_pratt(t, s), [])
29+
30+
def test_string_matching_boyer_moore_horspool(self):
31+
t = 'ababbababa'
32+
s = 'aba'
33+
self.assertEquals(string_matching.string_matching_boyer_moore_horspool(t, s), [0, 5, 7])
34+
t = 'ababbababa'
35+
s = 'abbb'
36+
self.assertEquals(string_matching.string_matching_boyer_moore_horspool(t, s), [])
37+
38+
if __name__ == '__main__':
39+
unittest.main()
40+

0 commit comments

Comments
 (0)