Skip to content

Commit d05f439

Browse files
committed
Several changes related to character encoding: convert utf-16 to utf-8 if found in the pre-parse algorithm, allow chardet to be switched off, and start implementing reparsing if a <meta> is found during the actual parse (not yet complete)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401056
1 parent 84313a8 commit d05f439

File tree

6 files changed

+36
-24
lines changed

6 files changed

+36
-24
lines changed

src/html5lib/html5parser.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,15 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7878
}
7979

8080
def _parse(self, stream, innerHTML=False, container="div",
81-
encoding=None, **kwargs):
81+
encoding=None, parseMeta=True, useChardet=True, **kwargs):
8282

8383
self.tree.reset()
8484
self.firstStartTag = False
8585
self.errors = []
8686

8787
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
88-
parseMeta=not innerHTML, **kwargs)
88+
parseMeta=parseMeta,
89+
useChardet=useChardet, **kwargs)
8990

9091
if innerHTML:
9192
self.innerHTML = container.lower()
@@ -131,7 +132,7 @@ def _parse(self, stream, innerHTML=False, container="div",
131132
# When the loop finishes it's EOF
132133
self.phase.processEOF()
133134

134-
def parse(self, stream, encoding=None):
135+
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
135136
"""Parse a HTML document into a well-formed tree
136137
137138
stream - a filelike object or string containing the HTML to be parsed
@@ -144,7 +145,8 @@ def parse(self, stream, encoding=None):
144145
self._parse(stream, innerHTML=False, encoding=encoding)
145146
return self.tree.getDocument()
146147

147-
def parseFragment(self, stream, container="div", encoding=None):
148+
def parseFragment(self, stream, container="div", encoding=None,
149+
parseMeta=False, useChardet=True):
148150
"""Parse a HTML fragment into a well-formed tree fragment
149151
150152
container - name of the element we're setting the innerHTML property

src/html5lib/inputstream.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3838
# List of where new lines occur
3939
self.newLines = [0]
4040

41-
self.charEncoding = encoding
41+
self.charEncoding = (encoding, "certian")
4242

4343
# Raw Stream - for unicode objects this will encode to utf-8 and set
4444
# self.charEncoding as appropriate
@@ -54,11 +54,11 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5454
self.defaultEncoding = "windows-1252"
5555

5656
#Detect encoding iff no explicit "transport level" encoding is supplied
57-
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
57+
if self.charEncoding[0] is None or not isValidEncoding(self.charEncoding[0]):
5858
self.charEncoding = self.detectEncoding(parseMeta, chardet)
5959

60-
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
61-
'replace')
60+
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
61+
' replace')
6262

6363
self.queue = deque([])
6464
self.readChars = []
@@ -92,12 +92,15 @@ def detectEncoding(self, parseMeta=True, chardet=True):
9292
#First look for a BOM
9393
#This will also read past the BOM if present
9494
encoding = self.detectBOM()
95+
confidence = "certain"
9596
#If there is no BOM need to look for meta elements with encoding
9697
#information
9798
if encoding is None and parseMeta:
9899
encoding = self.detectEncodingMeta()
100+
confidence = "tentative"
99101
#Guess with chardet, if available
100102
if encoding is None and chardet:
103+
confidence = "tentative"
101104
try:
102105
from chardet.universaldetector import UniversalDetector
103106
buffers = []
@@ -115,6 +118,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
115118
pass
116119
# If all else fails use the default encoding
117120
if encoding is None:
121+
confidence="tentative"
118122
encoding = self.defaultEncoding
119123

120124
#Substitute for equivalent encodings:
@@ -123,7 +127,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
123127
if encoding.lower() in encodingSub:
124128
encoding = encodingSub[encoding.lower()]
125129

126-
return encoding
130+
return encoding, confidence
127131

128132
def detectBOM(self):
129133
"""Attempts to detect at BOM at the start of the stream. If
@@ -200,7 +204,8 @@ def detectEncodingMeta(self):
200204
buffer = self.rawStream.read(self.numBytesMeta)
201205
parser = EncodingParser(buffer)
202206
self.seek(buffer, 0)
203-
return parser.getEncoding()
207+
encoding = parser.getEncoding()
208+
return encoding
204209

205210
def updatePosition(self):
206211
#Remove EOF from readChars, if present
@@ -414,7 +419,12 @@ def getEncoding(self):
414419
if not keepParsing:
415420
break
416421
if self.encoding is not None:
417-
self.encoding = self.encoding.strip()
422+
self.encoding = self.encoding.strip()
423+
#Spec violation that complies with hsivonen + mjs
424+
if self.encoding.upper() in ("UTF-16", "UTF-16BE", "UTF-16LE",
425+
"UTF-32", "UTF-32BE", "UTF-32LE"):
426+
self.encoding = "utf-8"
427+
418428
return self.encoding
419429

420430
def handleComment(self):
@@ -531,7 +541,7 @@ def getAttribute(self):
531541
#11.5
532542
else:
533543
attrValue.extend(self.data.currentByte)
534-
elif self.data.currentByte in (">", '<'):
544+
elif self.data.currentByte in (">", "<"):
535545
return "".join(attrName), ""
536546
elif self.data.currentByte in asciiUppercase:
537547
attrValue.extend(self.data.currentByte.lower())
@@ -540,7 +550,7 @@ def getAttribute(self):
540550
while True:
541551
self.data.position +=1
542552
if self.data.currentByte in (
543-
list(spaceCharacters) + [">", '<']):
553+
list(spaceCharacters) + [">", "<"]):
544554
return "".join(attrName), "".join(attrValue)
545555
elif self.data.currentByte in asciiUppercase:
546556
attrValue.extend(self.data.currentByte.lower())

src/html5lib/sanitizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,11 @@ def sanitize_css(self, style):
188188
return ' '.join(clean)
189189

190190
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
191-
def __init__(self, stream, encoding=None, parseMeta=True,
191+
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
192192
lowercaseElementName=False, lowercaseAttrName=False):
193193
#Change case matching defaults as we only output lowercase html anyway
194194
#This solution doesn't seem ideal...
195-
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
195+
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
196196
lowercaseElementName, lowercaseAttrName)
197197

198198
def __iter__(self):

src/html5lib/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ class HTMLTokenizer(object):
3030

3131
# XXX need to fix documentation
3232

33-
def __init__(self, stream, encoding=None, parseMeta=True,
33+
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
3434
lowercaseElementName=True, lowercaseAttrName=True,):
35-
self.stream = HTMLInputStream(stream, encoding, parseMeta)
35+
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
3636

3737
#Perform case conversions?
3838
self.lowercaseElementName = lowercaseElementName

tests/test_encoding.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def buildTestSuite():
1616
for idx, test in enumerate(tests):
1717
def encodingTest(self, data=test['data'], encoding=test['encoding']):
1818
stream = inputstream.HTMLInputStream(data,chardet=False)
19-
self.assertEquals(encoding.lower(), stream.charEncoding)
19+
self.assertEquals(encoding.lower(), stream.charEncoding[0])
2020
setattr(Html5EncodingTestCase, 'test_%s_%d' % (test_name, idx+1),
2121
encodingTest)
2222

@@ -25,7 +25,7 @@ def encodingTest(self, data=test['data'], encoding=test['encoding']):
2525
def test_chardet(self):
2626
data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read()
2727
encoding = inputstream.HTMLInputStream(data).charEncoding
28-
assert encoding.lower() == "big5"
28+
assert encoding[0].lower() == "big5"
2929
setattr(Html5EncodingTestCase, 'test_chardet', test_chardet)
3030
except ImportError:
3131
print "chardet not found, skipping chardet tests"

tests/test_stream.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class HTMLInputStreamTest(unittest.TestCase):
77

88
def test_char_ascii(self):
99
stream = HTMLInputStream("'", encoding='ascii')
10-
self.assertEquals(stream.charEncoding, 'ascii')
10+
self.assertEquals(stream.charEncoding[0], 'ascii')
1111
self.assertEquals(stream.char(), "'")
1212

1313
def test_char_null(self):
@@ -16,24 +16,24 @@ def test_char_null(self):
1616

1717
def test_char_utf8(self):
1818
stream = HTMLInputStream(u'\u2018'.encode('utf-8'), encoding='utf-8')
19-
self.assertEquals(stream.charEncoding, 'utf-8')
19+
self.assertEquals(stream.charEncoding[0], 'utf-8')
2020
self.assertEquals(stream.char(), u'\u2018')
2121

2222
def test_char_win1252(self):
2323
stream = HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))
24-
self.assertEquals(stream.charEncoding, 'windows-1252')
24+
self.assertEquals(stream.charEncoding[0], 'windows-1252')
2525
self.assertEquals(stream.char(), u"\xa9")
2626
self.assertEquals(stream.char(), u"\xf1")
2727
self.assertEquals(stream.char(), u"\u2019")
2828

2929
def test_bom(self):
3030
stream = HTMLInputStream(codecs.BOM_UTF8 + "'")
31-
self.assertEquals(stream.charEncoding, 'utf-8')
31+
self.assertEquals(stream.charEncoding[0], 'utf-8')
3232
self.assertEquals(stream.char(), "'")
3333

3434
def test_utf_16(self):
3535
stream = HTMLInputStream((' '*1025).encode('utf-16'))
36-
self.assert_(stream.charEncoding in ['utf-16-le','utf-16-be'])
36+
self.assert_(stream.charEncoding[0] in ['utf-16-le','utf-16-be'], stream.charEncoding)
3737
self.assertEquals(len(stream.charsUntil(' ',True)),1025)
3838

3939
def test_newlines(self):

0 commit comments

Comments
 (0)