Skip to content

Commit db43ce2

Browse files
committed
Get encoding tests passing, and test the pre-scan separately
1 parent 2816de7 commit db43ce2

File tree

3 files changed

+85
-49
lines changed

3 files changed

+85
-49
lines changed

html5lib/html5parser.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -777,7 +777,9 @@ def startTagMeta(self, token):
777777
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
778778
if "charset" in attributes:
779779
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
780-
elif "content" in attributes:
780+
elif ("content" in attributes and
781+
"http-equiv" in attributes and
782+
attributes["http-equiv"].lower() == "content-type"):
781783
# Encoding it as UTF-8 here is a hack, as really we should pass
782784
# the abstract Unicode string, and just use the
783785
# ContentAttrParser on that, but using UTF-8 allows all chars

html5lib/inputstream.py

Lines changed: 66 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,10 @@ class BufferedIOBase(object):
2121
pass
2222

2323
#Non-unicode versions of constants for use in the pre-parser
24-
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
25-
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
26-
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
27-
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
24+
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
25+
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
26+
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
27+
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
2828

2929
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
3030

@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391391
parseMeta - Look for a <meta> element containing encoding information
392392
393393
"""
394-
self.charEncoding = (codecName(encoding), "certain")
395-
396394
# Raw Stream - for unicode objects this will encode to utf-8 and set
397395
# self.charEncoding as appropriate
398396
self.rawStream = self.openStream(source)
399397

398+
HTMLUnicodeInputStream.__init__(self, self.rawStream)
399+
400+
self.charEncoding = (codecName(encoding), "certain")
401+
400402
# Encoding Information
401403
#Number of bytes to use when looking for a meta element with
402404
#encoding information
@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
411413
self.charEncoding = self.detectEncoding(parseMeta, chardet)
412414

413415
#Call superclass
414-
HTMLUnicodeInputStream.__init__(self, self.rawStream)
416+
self.reset()
415417

416418
def reset(self):
417419
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
@@ -538,12 +540,13 @@ def detectEncodingMeta(self):
538540

539541
return encoding
540542

541-
class EncodingBytes(str):
543+
class EncodingBytes(bytes):
542544
"""String-like object with an associated position and various extra methods
543545
If the position is ever greater than the string length then an exception is
544546
raised"""
545547
def __new__(self, value):
546-
return str.__new__(self, value.lower())
548+
assert isinstance(value, bytes)
549+
return bytes.__new__(self, value.lower())
547550

548551
def __init__(self, value):
549552
self._position=-1
@@ -557,7 +560,7 @@ def __next__(self):
557560
raise StopIteration
558561
elif p < 0:
559562
raise TypeError
560-
return self[p]
563+
return self[p:p+1]
561564

562565
def previous(self):
563566
p = self._position
@@ -566,7 +569,7 @@ def previous(self):
566569
elif p < 0:
567570
raise TypeError
568571
self._position = p = p - 1
569-
return self[p]
572+
return self[p:p+1]
570573

571574
def setPosition(self, position):
572575
if self._position >= len(self):
@@ -584,15 +587,15 @@ def getPosition(self):
584587
position = property(getPosition, setPosition)
585588

586589
def getCurrentByte(self):
587-
return self[self.position]
590+
return self[self.position:self.position+1]
588591

589592
currentByte = property(getCurrentByte)
590593

591594
def skip(self, chars=spaceCharactersBytes):
592595
"""Skip past a list of characters"""
593596
p = self.position # use property for the error-checking
594597
while p < len(self):
595-
c = self[p]
598+
c = self[p:p+1]
596599
if c not in chars:
597600
self._position = p
598601
return c
@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):
603606
def skipUntil(self, chars):
604607
p = self.position
605608
while p < len(self):
606-
c = self[p]
609+
c = self[p:p+1]
607610
if c in chars:
608611
self._position = p
609612
return c
@@ -645,12 +648,12 @@ def __init__(self, data):
645648

646649
def getEncoding(self):
647650
methodDispatch = (
648-
("<!--",self.handleComment),
649-
("<meta",self.handleMeta),
650-
("</",self.handlePossibleEndTag),
651-
("<!",self.handleOther),
652-
("<?",self.handleOther),
653-
("<",self.handlePossibleStartTag))
651+
(b"<!--",self.handleComment),
652+
(b"<meta",self.handleMeta),
653+
(b"</",self.handlePossibleEndTag),
654+
(b"<!",self.handleOther),
655+
(b"<?",self.handleOther),
656+
(b"<",self.handlePossibleStartTag))
654657
for byte in self.data:
655658
keepParsing = True
656659
for key, method in methodDispatch:
@@ -663,37 +666,48 @@ def getEncoding(self):
663666
break
664667
if not keepParsing:
665668
break
666-
669+
667670
return self.encoding
668671

669672
def handleComment(self):
670673
"""Skip over comments"""
671-
return self.data.jumpTo("-->")
674+
return self.data.jumpTo(b"-->")
672675

673676
def handleMeta(self):
674677
if self.data.currentByte not in spaceCharactersBytes:
675678
#if we have <meta not followed by a space so just keep going
676679
return True
677680
#We have a valid meta element we want to search for attributes
681+
hasPragma = False
682+
pendingEncoding = None
678683
while True:
679684
#Try to find the next attribute after the current position
680685
attr = self.getAttribute()
681686
if attr is None:
682687
return True
683688
else:
684-
if attr[0] == "charset":
689+
if attr[0] == b"http-equiv":
690+
hasPragma = attr[1] == b"content-type"
691+
if hasPragma and pendingEncoding is not None:
692+
self.encoding = pendingEncoding
693+
return False
694+
elif attr[0] == b"charset":
685695
tentativeEncoding = attr[1]
686696
codec = codecName(tentativeEncoding)
687697
if codec is not None:
688698
self.encoding = codec
689699
return False
690-
elif attr[0] == "content":
700+
elif attr[0] == b"content":
691701
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
692702
tentativeEncoding = contentParser.parse()
693-
codec = codecName(tentativeEncoding)
694-
if codec is not None:
695-
self.encoding = codec
696-
return False
703+
if tentativeEncoding is not None:
704+
codec = codecName(tentativeEncoding)
705+
if codec is not None:
706+
if hasPragma:
707+
self.encoding = codec
708+
return False
709+
else:
710+
pendingEncoding = codec
697711

698712
def handlePossibleStartTag(self):
699713
return self.handlePossibleTag(False)
@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):
714728
return True
715729

716730
c = data.skipUntil(spacesAngleBrackets)
717-
if c == "<":
731+
if c == b"<":
718732
#return to the first step in the overall "two step" algorithm
719733
#reprocessing the < byte
720734
data.previous()
@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):
726740
return True
727741

728742
def handleOther(self):
729-
return self.data.jumpTo(">")
743+
return self.data.jumpTo(b">")
730744

731745
def getAttribute(self):
732746
"""Return a name,value pair for the next attribute in the stream,
733747
if one is found, or None"""
734748
data = self.data
735749
# Step 1 (skip chars)
736-
c = data.skip(spaceCharactersBytes | frozenset("/"))
750+
c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
751+
assert c is None or len(c) == 1
737752
# Step 2
738-
if c in (">", None):
753+
if c in (b">", None):
739754
return None
740755
# Step 3
741756
attrName = []
742757
attrValue = []
743758
#Step 4 attribute name
744759
while True:
745-
if c == "=" and attrName:
760+
if c == b"=" and attrName:
746761
break
747762
elif c in spaceCharactersBytes:
748763
#Step 6!
749764
c = data.skip()
750-
c = next(data)
751765
break
752-
elif c in ("/", ">"):
753-
return "".join(attrName), ""
766+
elif c in (b"/", b">"):
767+
return b"".join(attrName), b""
754768
elif c in asciiUppercaseBytes:
755769
attrName.append(c.lower())
756770
elif c == None:
@@ -760,15 +774,15 @@ def getAttribute(self):
760774
#Step 5
761775
c = next(data)
762776
#Step 7
763-
if c != "=":
777+
if c != b"=":
764778
data.previous()
765-
return "".join(attrName), ""
779+
return b"".join(attrName), b""
766780
#Step 8
767781
next(data)
768782
#Step 9
769783
c = data.skip()
770784
#Step 10
771-
if c in ("'", '"'):
785+
if c in (b"'", b'"'):
772786
#10.1
773787
quoteChar = c
774788
while True:
@@ -777,15 +791,15 @@ def getAttribute(self):
777791
#10.3
778792
if c == quoteChar:
779793
next(data)
780-
return "".join(attrName), "".join(attrValue)
794+
return b"".join(attrName), b"".join(attrValue)
781795
#10.4
782796
elif c in asciiUppercaseBytes:
783797
attrValue.append(c.lower())
784798
#10.5
785799
else:
786800
attrValue.append(c)
787-
elif c == ">":
788-
return "".join(attrName), ""
801+
elif c == b">":
802+
return b"".join(attrName), b""
789803
elif c in asciiUppercaseBytes:
790804
attrValue.append(c.lower())
791805
elif c is None:
@@ -796,7 +810,7 @@ def getAttribute(self):
796810
while True:
797811
c = next(data)
798812
if c in spacesAngleBrackets:
799-
return "".join(attrName), "".join(attrValue)
813+
return b"".join(attrName), b"".join(attrValue)
800814
elif c in asciiUppercaseBytes:
801815
attrValue.append(c.lower())
802816
elif c is None:
@@ -807,21 +821,22 @@ def getAttribute(self):
807821

808822
class ContentAttrParser(object):
809823
def __init__(self, data):
824+
assert isinstance(data, bytes)
810825
self.data = data
811826
def parse(self):
812827
try:
813828
#Check if the attr name is charset
814829
#otherwise return
815-
self.data.jumpTo("charset")
830+
self.data.jumpTo(b"charset")
816831
self.data.position += 1
817832
self.data.skip()
818-
if not self.data.currentByte == "=":
833+
if not self.data.currentByte == b"=":
819834
#If there is no = sign keep looking for attrs
820835
return None
821836
self.data.position += 1
822837
self.data.skip()
823838
#Look for an encoding between matching quote marks
824-
if self.data.currentByte in ('"', "'"):
839+
if self.data.currentByte in (b'"', b"'"):
825840
quoteMark = self.data.currentByte
826841
self.data.position += 1
827842
oldPosition = self.data.position
@@ -845,6 +860,11 @@ def parse(self):
845860
def codecName(encoding):
846861
"""Return the python codec name corresponding to an encoding or None if the
847862
string doesn't correspond to a valid encoding."""
863+
if isinstance(encoding, bytes):
864+
try:
865+
encoding = encoding.decode("ascii")
866+
except UnicodeDecodeError:
867+
return None
848868
if encoding:
849869
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
850870
return encodings.get(canonicalName, None)

html5lib/tests/test_encoding.py

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ def test_codec_name_c(self):
2323
def test_codec_name_d(self):
2424
self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
2525

26-
def runEncodingTest(data, encoding):
26+
def runParserEncodingTest(data, encoding):
2727
p = HTMLParser()
2828
t = p.parse(data, useChardet=False)
2929
encoding = encoding.lower().decode("ascii")
@@ -33,13 +33,27 @@ def runEncodingTest(data, encoding):
3333
repr(p.tokenizer.stream.charEncoding[0])))
3434
assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage
3535

36+
37+
def runPreScanEncodingTest(data, encoding):
38+
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
39+
encoding = encoding.lower().decode("ascii")
40+
41+
if len(data) > stream.numBytesMeta:
42+
return
43+
44+
errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
45+
(data, repr(encoding),
46+
repr(stream.charEncoding[0])))
47+
assert encoding == stream.charEncoding[0], errorMessage
48+
3649
def test_encoding():
3750
for filename in get_data_files("encoding"):
3851
test_name = os.path.basename(filename).replace('.dat',''). \
3952
replace('-','')
4053
tests = TestData(filename, b"data", encoding=None)
4154
for idx, test in enumerate(tests):
42-
yield (runEncodingTest, test[b'data'], test[b'encoding'])
55+
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
56+
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
4357

4458
try:
4559
import chardet

0 commit comments

Comments
 (0)