initial commit

laurentluce · laurentluce · commit d60e2dace3b6 · 2010-11-22T17:24:58.000-08:00
diff --git a/MANIFEST b/MANIFEST
@@ -0,0 +1,3 @@
+README
+setup.py
+string_matching.py
diff --git a/README b/README
@@ -0,0 +1,23 @@
+## Python Algorithms Library
+## Laurent Luce
+
+### Description
+The purpose of this library is to help you with common algorithms like:
+
+String matching:
+  - Naive
+  - Rabin-Karp
+  - Knuth-Morris-Pratt
+  - Boyer-Moore-Horspool
+
+### Installation
+Get the source and run
+
+    $ python setup.py install
+
+### Usage
+To use the Python Algorithms library, just import the modules you need
+like 'import string_matching' in your current python application.
+
+### License
+The Python Algorithms Library is distributed under the MIT License
diff --git a/performance/performance_string_matching.py b/performance/performance_string_matching.py
@@ -0,0 +1,41 @@
+import time
+import string_matching
+
+class StringMatchingPerformance:
+  """
+  Small benchmark harness for the string matching algorithms.
+
+  Each algorithm is called repeatedly on the same short text/pattern pair and
+  the total wall-clock time is printed.
+  """
+
+  def __init__(self):
+    pass
+
+  def calculate_performance(self):
+    """
+    Time every string matching function and print one line per algorithm.
+
+    Output format: '<function name>: <elapsed> seconds'
+    """
+    t = 'ababbababa'
+    s = 'aba'
+    times = 1000
+
+    # (label, callable) pairs, timed in this order
+    algorithms = (
+      ('string_matching_naive',
+       string_matching.string_matching_naive),
+      ('string_matching_rabin_karp',
+       string_matching.string_matching_rabin_karp),
+      ('string_matching_knuth_morris_pratt',
+       string_matching.string_matching_knuth_morris_pratt),
+      ('string_matching_boyer_moore_horspool',
+       string_matching.string_matching_boyer_moore_horspool),
+    )
+
+    for name, search in algorithms:
+      ts = time.time()
+      for _ in range(times):
+        search(t, s)
+      elapsed = time.time() - ts
+      # print(...) with a single pre-formatted string behaves the same on
+      # Python 2 and Python 3
+      print('%s: %.2f seconds' % (name, elapsed))
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == '__main__':
+  StringMatchingPerformance().calculate_performance()
+
diff --git a/setup.py b/setup.py
@@ -0,0 +1,29 @@
+from distutils.core import setup
+# Distutils packaging metadata for the "pyalgorithms" distribution.
+# The distribution ships a single top-level module, string_matching.
+setup(
+    name = "pyalgorithms",
+    # Modules installed by `python setup.py install`
+    py_modules = ['string_matching'],
+    version = "0.1",
+    description = "Python Algorithms",
+    author = "Laurent Luce",
+    author_email = "laurentluce49@yahoo.com",
+    url = "http://github.com/laurentluce/pyalgorithms",
+    download_url = "http://github.com/laurentluce/pyalgorithms",
+    keywords = ["pyalgorithms","algorithms"],
+    # Trove classifiers describing the package
+    classifiers = [
+        "Programming Language :: Python",
+        "Operating System :: OS Independent",
+        "License :: OSI Approved :: MIT License",
+        "Intended Audience :: Developers",
+        "Development Status :: 5 - Production/Stable",
+        "Topic :: Software Development :: Libraries :: Python Modules"
+        ],
+    # The backslash after the opening quotes suppresses the leading newline
+    long_description = """\
+    Python Algorithms Library
+    ----------------------------
+    
+    DESCRIPTION
+    The purpose of this library is to help you with basic and more advanced
+    algorithms
+    
+    LICENSE The Python Algorithms Library is distributed under the MIT
+    License """ )
diff --git a/string_matching.py b/string_matching.py
@@ -0,0 +1,167 @@
+"""
+Filename: string_matching.py
+"""
+
+def string_matching_naive(text='', pattern=''):
+  """
+  Find every occurrence of pattern in text by brute force
+
+  Each candidate shift i in 0..n-m is tested by comparing pattern against the
+  m-character window text[i:i+m]
+
+  O((n-m+1)m)
+  Example: text = 'ababbababa', pattern = 'aba'
+           string_matching_naive(text, pattern) returns [0, 5, 7]
+  @param text text to search inside
+  @param pattern string to search for
+  @return list containing offsets (shifts) where pattern is found inside text
+  """
+
+  window = len(pattern)
+  last_shift = len(text) - window
+  return [shift for shift in range(last_shift + 1)
+          if text[shift:shift + window] == pattern]
+
+
+def string_matching_rabin_karp(text='', pattern='', hash_base=256,
+                               modulus=2305843009213693951):
+  """
+  Returns positions where pattern is found in text
+
+  We calculate the hash value of the pattern and we compare it to the hash
+  value of text[i:i+m] for i = 0..n-m
+  The nice thing is that we don't need to calculate the hash value of
+  text[i:i+m] each time from scratch, we know that:
+  h(text[i+1:i+m+1]) =
+    (base * (h(text[i:i+m]) - text[i] * base^(m-1)) + text[i+m]) mod modulus
+  We can get h('bcd') from h('abc').
+  h('bcd') = (base * (h('abc') - 'a' * base^2) + 'd') mod modulus
+
+  All hashes are reduced modulo 'modulus' so they stay small and every rolling
+  update costs O(1). Two different windows may then share a hash value, which
+  is why each hash hit is confirmed by comparing the actual characters.
+
+  worst case: O(nm)
+  we can expect O(n+m) if the number of valid matches is small and the pattern
+  large
+
+  An empty pattern matches at every shift 0..n, like string_matching_naive.
+
+  Performance: ord() is slow so we shouldn't use it here
+
+  Example: text = 'ababbababa', pattern = 'aba'
+           string_matching_rabin_karp(text, pattern) returns [0, 5, 7]
+  @param text text to search inside
+  @param pattern string to search for
+  @param hash_base integer base to calculate the hash value
+  @param modulus positive integer the hash values are reduced by
+                 (default: the prime 2^61 - 1)
+  @return list containing offsets (shifts) where pattern is found inside text
+  """
+
+  n = len(text)
+  m = len(pattern)
+  if m == 0:
+    # base^(m-1) is not an integer for m == 0; answer directly instead
+    return list(range(n + 1))
+  offsets = []
+  if m > n:
+    return offsets
+
+  # weight of the leading character of a window; loop-invariant
+  high_order = pow(hash_base, m - 1, modulus)
+  hpattern = hash_value(pattern, hash_base, modulus)
+  htext = hash_value(text[:m], hash_base, modulus)
+  for i in range(n - m + 1):
+    # equal hashes only suggest a match; verify to rule out collisions
+    if htext == hpattern and text[i:i+m] == pattern:
+      offsets.append(i)
+    if i < n - m:
+      # drop text[i], shift left by one digit, append text[i+m]
+      htext = (hash_base * (htext - ord(text[i]) * high_order)
+               + ord(text[i+m])) % modulus
+
+  return offsets
+
+def hash_value(s, base, modulus=None):
+  """
+  Calculate the hash value of a string using base
+
+  Example: 'abc' = 97 x base^2 + 98 x base^1 + 99 x base^0
+  The polynomial is evaluated with Horner's rule, one multiplication per
+  character.
+  @param s string to compute hash value for
+  @param base base to use to compute hash value
+  @param modulus optional positive integer; when given, the result is reduced
+                 modulo this value (None keeps the exact value)
+  @return hash value
+  """
+  v = 0
+  for c in s:
+    v = v * base + ord(c)
+    if modulus is not None:
+      v %= modulus
+
+  return v
+
+def string_matching_knuth_morris_pratt(text='', pattern=''):
+  """
+  Returns positions where pattern is found in text
+
+  See http://jboxer.com/2009/12/the-knuth-morris-pratt-algorithm-in-my-own-words/ for a great explanation on how this algorithm works.
+
+  The text is scanned once from left to right. q is the number of pattern
+  characters currently matched; on a mismatch the prefix table tells us how
+  many of them are still valid so no text character is read twice.
+
+  An empty pattern matches at every shift 0..n, like string_matching_naive.
+
+  O(m+n)
+  Example: text = 'ababbababa', pattern = 'aba'
+           string_matching_knuth_morris_pratt(text, pattern) returns [0, 5, 7]
+  @param text text to search inside
+  @param pattern string to search for
+  @return list containing offsets (shifts) where pattern is found inside text
+  """
+
+  n = len(text)
+  m = len(pattern)
+  if m == 0:
+    # pattern[q] below would raise IndexError for an empty pattern
+    return list(range(n + 1))
+  offsets = []
+  pi = compute_prefix_function(pattern)
+  q = 0
+  for i, c in enumerate(text):
+    # fall back to shorter matched prefixes until c can extend one
+    while q > 0 and pattern[q] != c:
+      q = pi[q - 1]
+    if pattern[q] == c:
+      q = q + 1
+    if q == m:
+      offsets.append(i - m + 1)
+      # keep the longest border so overlapping matches are found too
+      q = pi[q - 1]
+
+  return offsets
+
+def compute_prefix_function(p):
+  """
+  Build the Knuth-Morris-Pratt prefix table of p
+
+  Entry i holds the length of the longest proper prefix of p[:i+1] that is
+  also a suffix of p[:i+1].
+
+  Example: compute_prefix_function('ababaca') returns [0, 0, 1, 2, 3, 0, 1]
+  @param p pattern to analyse
+  @return list of len(p) integers
+  """
+  size = len(p)
+  table = [0] * size
+  matched = 0
+  for pos in range(1, size):
+    current = p[pos]
+    # shrink the matched prefix until current can extend it
+    while matched and p[matched] != current:
+      matched = table[matched - 1]
+    if p[matched] == current:
+      matched += 1
+    table[pos] = matched
+  return table
+
+def string_matching_boyer_moore_horspool(text='', pattern=''):
+  """
+  Returns positions where pattern is found in text
+
+  See http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm for an explanation on how
+  this algorithm works.
+
+  The window is aligned so that it ends at text[k]. After each comparison the
+  window is shifted according to the character text[k]: by its distance from
+  the end of the pattern when it occurs in pattern[:-1], by the whole pattern
+  length otherwise.
+
+  The shift table is a dictionary keyed by character, so any character can be
+  handled (not only those with a code below 256) and ord() is not needed.
+
+  An empty pattern matches at every shift 0..n, like string_matching_naive.
+
+  worst case: O(nm), usually much faster because several characters are
+  skipped at each step
+
+  Example: text = 'ababbababa', pattern = 'aba'
+           string_matching_boyer_moore_horspool(text, pattern) returns [0, 5, 7]
+  @param text text to search inside
+  @param pattern string to search for
+  @return list containing offsets (shifts) where pattern is found inside text
+  """
+
+  m = len(pattern)
+  n = len(text)
+  if m == 0:
+    # a zero-length window would never advance
+    return list(range(n + 1))
+  offsets = []
+  if m > n:
+    return offsets
+
+  # the last pattern character is left out on purpose: it must not produce
+  # a shift of 0
+  skip = {}
+  for k in range(m - 1):
+    skip[pattern[k]] = m - k - 1
+
+  k = m - 1
+  while k < n:
+    start = k - m + 1
+    if text[start:k + 1] == pattern:
+      offsets.append(start)
+    # characters missing from the table allow a full-length shift
+    k += skip.get(text[k], m)
+
+  return offsets
+
diff --git a/tests/test_string_matching.py b/tests/test_string_matching.py
@@ -0,0 +1,40 @@
+import unittest
+import string_matching
+
+class StringMatchingTest(unittest.TestCase):
+  """
+  Runs the same two searches against every string matching algorithm:
+  a pattern with overlapping occurrences and a pattern that is absent.
+  """
+
+  def check_algorithm(self, search):
+    """
+    Assert that 'search' returns the expected offsets
+
+    @param search function taking (text, pattern) and returning a list of offsets
+    """
+    t = 'ababbababa'
+    # assertEqual is used because assertEquals is a deprecated alias
+    self.assertEqual(search(t, 'aba'), [0, 5, 7])
+    self.assertEqual(search(t, 'abbb'), [])
+
+  def test_string_matching_naive(self):
+    self.check_algorithm(string_matching.string_matching_naive)
+
+  def test_string_matching_rabin_karp(self):
+    self.check_algorithm(string_matching.string_matching_rabin_karp)
+
+  def test_string_matching_knuth_morris_pratt(self):
+    self.check_algorithm(string_matching.string_matching_knuth_morris_pratt)
+
+  def test_string_matching_boyer_moore_horspool(self):
+    self.check_algorithm(string_matching.string_matching_boyer_moore_horspool)
+
+# Run the test suite when this file is executed directly.
+if __name__ == '__main__':
+    unittest.main()
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+README`
	`2`	`+setup.py`
	`3`	`+string_matching.py`