Skip to content

Commit a0eefb1

Browse files
committed
Initial testcases for encoding detection
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40506
1 parent 85d6960 commit a0eefb1

File tree

2 files changed

+94
-14
lines changed

2 files changed

+94
-14
lines changed

src/inputstream.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(self, source, encoding=None):
3737
#encoding information
3838
self.numBytesMeta = 512
3939
#Encoding to use if no other information can be found
40-
self.defaultEncoding = "cp1252"
40+
self.defaultEncoding = "windows-1252"
4141
#Detect encoding iff no explicit "transport level" encoding is supplied
4242
if encoding is None:
4343
encoding = self.detectEncoding()
@@ -46,7 +46,7 @@ def __init__(self, source, encoding=None):
4646
# Read bytes from stream decoding them into Unicode
4747
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
4848

49-
# Normalize new lines and null characters
49+
# Normalize newlines and null characters
5050
uString = re.sub('\r\n?', '\n', uString)
5151
uString = re.sub('\x00', '\xFFFD', uString)
5252

@@ -78,24 +78,31 @@ def detectEncoding(self):
7878
#First look for a BOM
7979
#This will also read past the BOM if present
8080
encoding = self.detectBOM()
81-
if encoding is not None:
82-
return encoding
8381

8482
#If there is no BOM need to look for meta elements with encoding
8583
#information
86-
encoding = self.detectEncodingMeta()
87-
if encoding is not None:
88-
return encoding
84+
if encoding is None:
85+
encoding = self.detectEncodingMeta()
8986

9087
#Guess with chardet, if available
91-
try:
92-
import chardet
93-
return chardet.detect(self.rawStream)['encoding']
94-
except ImportError:
95-
pass
88+
if encoding is None:
89+
try:
90+
import chardet
91+
encoding = chardet.detect(self.rawStream)['encoding']
92+
except ImportError:
93+
pass
9694

9795
# If all else fails use the default encoding
98-
return self.defaultEncoding
96+
if encoding is None:
97+
encoding = self.defaultEncoding
98+
99+
#Substitute for equivalent encodings:
100+
encodingSub = {"iso-8859-1":"windows-1252"}
101+
102+
if encoding.lower() in encodingSub:
103+
encoding = encodingSub[encoding.lower()]
104+
105+
return encoding
99106

100107
def detectBOM(self):
101108
"""Attempts to detect at BOM at the start of the stream. If
@@ -354,7 +361,7 @@ def isValidEncoding(self, encoding):
354361
try:
355362
codecs.lookup(encoding)
356363
rv = True
357-
except codecs.lookup_error:
364+
except LookupError:
358365
rv = False
359366
return rv
360367

tests/test_encoding.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import sys
2+
import os
3+
import glob
4+
import StringIO
5+
import unittest
6+
import new
7+
8+
# XXX Allow us to import the sibling module
# Work from the directory that holds this file, so the relative paths used
# below (os.pardir here, and the 'encoding/*.dat' glob in test_encoding)
# resolve the same way wherever the script is launched from.
9+
os.chdir(os.path.split(os.path.abspath(__file__))[0])
# Put the absolute path of ../src at the front of sys.path so that the
# `import inputstream` that follows picks up the module from there.
10+
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
11+
12+
import inputstream
13+
import codecs
14+
15+
def parseTestcase(testString):
    """Split one encoding testcase into its input text and expected encoding.

    The testcase must start with a "#data" line. Lines after it are
    collected as input until a line starting with "#encoding" is seen;
    lines after that are collected as the expected encoding. Blank lines
    and any further lines starting with "#data" are dropped.

    Returns a tuple (input, encoding): the input lines re-joined with a
    newline, and the first line found after the "#encoding" marker.

    Raises ValueError if the first line is not exactly "#data", or if no
    non-blank line follows the "#encoding" marker.
    """
    lines = testString.split("\n")
    if lines[0] != "#data":
        # Raise explicitly rather than assert, so the check survives -O.
        raise ValueError("testcase must start with '#data', got %r"
                         % (lines[0],))

    dataLines = []
    encodingLines = []
    currentList = dataLines
    for line in lines:
        if line.startswith("#encoding"):
            # Everything after this marker belongs to the encoding section.
            currentList = encodingLines
        elif line and not line.startswith("#data"):
            currentList.append(line)

    if not encodingLines:
        raise ValueError("testcase has no expected encoding after "
                         "'#encoding'")
    return "\n".join(dataLines), encodingLines[0]
33+
34+
class TestCase(unittest.TestCase):
    """Test case class that buildTestSuite attaches generated tests to."""

    def runEncodingTest(self, input, encoding):
        """Check that the encoding picked for `input` matches `encoding`.

        Builds an HTMLInputStream from `input` without passing an explicit
        encoding, then compares the stream's charEncoding with the expected
        name, ignoring case. On mismatch the failure message shows the
        input, the expected encoding and the one actually chosen.
        """
        # XXX (original author's note) move this out into the setup function
        stream = inputstream.HTMLInputStream(input)

        errorMsg = "\n".join(["\n\nInput", input, "\nExpected:", encoding,
                              "\nRecieved:", stream.charEncoding])
        # assertEqual is the supported name; assertEquals is a deprecated
        # alias.
        self.assertEqual(encoding.lower(), stream.charEncoding.lower(),
                         errorMsg)
44+
45+
def test_encoding():
    """Yield one (test function, input, expected encoding) triple per case.

    Reads every file matching 'encoding/*.dat' relative to the current
    working directory, splits its contents on "#data" header lines and
    parses each non-empty piece with parseTestcase. The yielded function is
    always TestCase.runEncodingTest.
    """
    # Sort the matches: glob returns them in arbitrary order, which would
    # make the numbering of the generated tests vary between runs.
    for filename in sorted(glob.glob('encoding/*.dat')):
        f = open(filename)
        try:
            contents = f.read()
        finally:
            # Close before yielding so no handle stays open while the
            # generator is suspended.
            f.close()
        for test in contents.split("#data\n"):
            if test == "":
                continue
            # split() consumed the header line; restore it for the parser.
            testInput, encoding = parseTestcase("#data\n" + test)
            yield TestCase.runEncodingTest, testInput, encoding
55+
56+
def buildTestSuite():
    """Attach one test method per encoding testcase and load the suite.

    For each triple yielded by test_encoding, a method named test1, test2,
    ... is set on TestCase. Each method calls the yielded function with
    that testcase's input and expected encoding.

    Returns the suite produced by loading all tests from TestCase.
    """
    for index, (func, testInput, encoding) in enumerate(test_encoding()):
        testName = 'test%d' % (index + 1)

        # Default arguments bind this iteration's values; a plain closure
        # would see only the values from the last iteration.
        def testFunc(self, method=func, input=testInput, encoding=encoding):
            return method(self, input, encoding)

        testFunc.__name__ = testName
        testFunc.__doc__ = 'Encoding %s' % (testName)
        # A function stored on the class is already looked up as a method,
        # so the deprecated new.instancemethod wrapper is not needed.
        setattr(TestCase, testName, testFunc)
    return unittest.TestLoader().loadTestsFromTestCase(TestCase)
67+
68+
def main():
    """Generate the encoding test methods, then run unittest's runner.

    The suite returned by buildTestSuite is not used here; the call is made
    for its effect of attaching the generated methods to TestCase before
    unittest.main() runs.
    """
    buildTestSuite()
    unittest.main()


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)