Initial testcases for encoding detection

jgraham · jgraham · commit a0eefb178521 · 2007-02-23T00:26:05.000Z
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40506
diff --git a/src/inputstream.py b/src/inputstream.py
@@ -37,7 +37,7 @@ def __init__(self, source, encoding=None):
         #encoding information
         self.numBytesMeta = 512
         #Encoding to use if no other information can be found
-        self.defaultEncoding = "cp1252"
+        self.defaultEncoding = "windows-1252"
         #Detect encoding iff no explicit "transport level" encoding is supplied
         if encoding is None:
             encoding = self.detectEncoding()
@@ -46,7 +46,7 @@ def __init__(self, source, encoding=None):
         # Read bytes from stream decoding them into Unicode
         uString = self.rawStream.read().decode(self.charEncoding, 'replace')
 
-        # Normalize new lines and null characters
+        # Normalize newlines and null characters
         uString = re.sub('\r\n?', '\n', uString)
         uString = re.sub('\x00', '\xFFFD', uString)
 
@@ -78,24 +78,31 @@ def detectEncoding(self):
         #First look for a BOM
         #This will also read past the BOM if present
         encoding = self.detectBOM()
-        if encoding is not None:
-            return encoding
 
         #If there is no BOM need to look for meta elements with encoding 
         #information
-        encoding = self.detectEncodingMeta()
-        if encoding is not None:
-            return encoding
+        if encoding is None:
+            encoding = self.detectEncodingMeta()
 
         #Guess with chardet, if avaliable
-        try:
-            import chardet
-            return chardet.detect(self.rawStream)['encoding']
-        except ImportError:
-            pass
+        if encoding is None:
+            try:
+                import chardet
+                encoding = chardet.detect(self.rawStream)['encoding']
+            except ImportError:
+                pass
 
         # If all else fails use the default encoding
-        return self.defaultEncoding
+        if encoding is None:
+            encoding = self.defaultEncoding
+        
+        #Substitute for equivalent encodings:
+        encodingSub = {"iso-8859-1":"windows-1252"}
+
+        if encoding.lower() in encodingSub:
+            encoding = encodingSub[encoding.lower()]
+        
+        return encoding
 
     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
@@ -354,7 +361,7 @@ def isValidEncoding(self, encoding):
         try:
             codecs.lookup(encoding)
             rv = True
-        except codecs.lookup_error:
+        except LookupError:
             rv = False
         return rv
 
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -0,0 +1,73 @@
+import sys
+import os
+import glob
+import StringIO
+import unittest
+import new
+
+# XXX Allow us to import the sibling module
+os.chdir(os.path.split(os.path.abspath(__file__))[0])
+sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
+
+import inputstream
+import codecs
+
+def parseTestcase(testString):
+    testString = testString.split("\n")
+    try:
+        if testString[0] != "#data":
+            sys.stderr.write(testString)
+        assert testString[0] == "#data"
+    except:
+        raise
+    input = []
+    encoding = []
+    currentList = input
+    for line in testString:
+        if line and not (line.startswith("#encoding") or
+                         line.startswith("#data")):
+            currentList.append(line)
+        elif line.startswith("#encoding"):
+            currentList = encoding
+    return "\n".join(input), encoding[0]
+
+class TestCase(unittest.TestCase):
+    def runEncodingTest(self, input, encoding):
+        #XXX - move this out into the setup function
+        #build the input stream and compare its charEncoding with the expected one
+        stream = inputstream.HTMLInputStream(input)
+        
+        errorMsg = "\n".join(["\n\nInput", input,"\nExpected:", encoding,
+                              "\nRecieved:", stream.charEncoding])
+        self.assertEquals(encoding.lower(), stream.charEncoding.lower(),
+                          errorMsg)
+
+def test_encoding():
+    for filename in glob.glob('encoding/*.dat'):
+        f = open(filename)
+        tests = f.read().split("#data\n")
+        for test in tests:
+            if test == "":
+                continue
+            test = "#data\n" + test
+            input, encoding = parseTestcase(test)
+            yield TestCase.runEncodingTest, input, encoding
+
+def buildTestSuite():
+    tests = 0
+    for func, input, encoding in test_encoding():
+        tests += 1
+        testName = 'test%d' % tests
+        testFunc = lambda self, method=func, input=input, encoding=encoding, \
+            : method(self, input, encoding)
+        testFunc.__doc__ = 'Encoding %s'%(testName)
+        instanceMethod = new.instancemethod(testFunc, None, TestCase)
+        setattr(TestCase, testName, instanceMethod)
+    return unittest.TestLoader().loadTestsFromTestCase(TestCase)
+
+def main():   
+    buildTestSuite()
+    unittest.main()
+
+if __name__ == "__main__":
+    main()