Skip to content

Commit 47b430e

Browse files
committed
Update encoding detection to match the spec, make the entire encoding detection case-insensitive (as the spec effectively requires), and fix a bug in the encoding byte stream that the update exposed.
1 parent 19f48dd commit 47b430e

File tree

1 file changed

+37
-35
lines changed

1 file changed

+37
-35
lines changed

src/html5lib/inputstream.py

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ class EncodingBytes(str):
471471
If the position is ever greater than the string length then an exception is
472472
raised"""
473473
def __new__(self, value):
474-
return str.__new__(self, value)
474+
return str.__new__(self, value.lower())
475475

476476
def __init__(self, value):
477477
self._position=-1
@@ -539,14 +539,12 @@ def skipUntil(self, chars):
539539
self._position = p
540540
return None
541541

542-
def matchBytes(self, bytes, lower=False):
542+
def matchBytes(self, bytes):
543543
"""Look for a sequence of bytes at the start of a string. If the bytes
544544
are found return True and advance the position to the byte after the
545545
match. Otherwise return False and leave the position alone"""
546546
p = self.position
547547
data = self[p:p+len(bytes)]
548-
if lower:
549-
data = data.lower()
550548
rv = data.startswith(bytes)
551549
if rv:
552550
self.position += len(bytes)
@@ -557,6 +555,9 @@ def jumpTo(self, bytes):
557555
a match is found advance the position to the last byte of the match"""
558556
newPosition = self[self.position:].find(bytes)
559557
if newPosition > -1:
558+
# XXX: This is ugly, but I can't see a nicer way to fix this.
559+
if self._position == -1:
560+
self._position = 0
560561
self._position += (newPosition + len(bytes)-1)
561562
return True
562563
else:
@@ -581,7 +582,7 @@ def getEncoding(self):
581582
for byte in self.data:
582583
keepParsing = True
583584
for key, method in methodDispatch:
584-
if self.data.matchBytes(key, lower=True):
585+
if self.data.matchBytes(key):
585586
try:
586587
keepParsing = method()
587588
break
@@ -659,72 +660,75 @@ def getAttribute(self):
659660
"""Return a name,value pair for the next attribute in the stream,
660661
if one is found, or None"""
661662
data = self.data
663+
# Step 1 (skip chars)
662664
c = data.skip(spaceCharactersBytes | frozenset("/"))
663-
if c == "<":
664-
data.previous()
665-
return None
666-
elif c == ">" or c is None:
665+
# Step 2
666+
if c in (">", None):
667667
return None
668+
# Step 3
668669
attrName = []
669670
attrValue = []
670-
spaceFound = False
671-
#Step 5 attribute name
671+
#Step 4 attribute name
672672
while True:
673673
if c == "=" and attrName:
674674
break
675675
elif c in spaceCharactersBytes:
676-
spaceFound=True
676+
#Step 6!
677+
c = data.skip()
678+
c = data.next()
677679
break
678-
elif c in ("/", "<", ">"):
680+
elif c in ("/", ">"):
679681
return "".join(attrName), ""
680682
elif c in asciiUppercaseBytes:
681683
attrName.append(c.lower())
684+
elif c == None:
685+
return None
682686
else:
683687
attrName.append(c)
684-
#Step 6
688+
#Step 5
685689
c = data.next()
686690
#Step 7
687-
if spaceFound:
688-
c = data.skip()
689-
#Step 8
690-
if c != "=":
691-
data.previous()
692-
return "".join(attrName), ""
693-
#XXX need to advance position in both spaces and value case
694-
#Step 9
691+
if c != "=":
692+
data.previous()
693+
return "".join(attrName), ""
694+
#Step 8
695695
data.next()
696-
#Step 10
696+
#Step 9
697697
c = data.skip()
698-
#Step 11
698+
#Step 10
699699
if c in ("'", '"'):
700-
#11.1
700+
#10.1
701701
quoteChar = c
702702
while True:
703-
#11.3
703+
#10.2
704704
c = data.next()
705+
#10.3
705706
if c == quoteChar:
706707
data.next()
707708
return "".join(attrName), "".join(attrValue)
708-
#11.4
709+
#10.4
709710
elif c in asciiUppercaseBytes:
710711
attrValue.append(c.lower())
711-
#11.5
712+
#10.5
712713
else:
713714
attrValue.append(c)
714-
elif c in (">", "<"):
715+
elif c == ">":
715716
return "".join(attrName), ""
716717
elif c in asciiUppercaseBytes:
717718
attrValue.append(c.lower())
718719
elif c is None:
719720
return None
720721
else:
721722
attrValue.append(c)
723+
# Step 11
722724
while True:
723725
c = data.next()
724726
if c in spacesAngleBrackets:
725727
return "".join(attrName), "".join(attrValue)
726728
elif c in asciiUppercaseBytes:
727729
attrValue.append(c.lower())
730+
elif c is None:
731+
return None
728732
else:
729733
attrValue.append(c)
730734

@@ -734,10 +738,6 @@ def __init__(self, data):
734738
self.data = data
735739
def parse(self):
736740
try:
737-
#Skip to the first ";"
738-
self.data.jumpTo(";")
739-
self.data.position += 1
740-
self.data.skip()
741741
#Check if the attr name is charset
742742
#otherwise return
743743
self.data.jumpTo("charset")
@@ -753,8 +753,10 @@ def parse(self):
753753
quoteMark = self.data.currentByte
754754
self.data.position += 1
755755
oldPosition = self.data.position
756-
self.data.jumpTo(quoteMark)
757-
return self.data[oldPosition:self.data.position]
756+
if self.data.jumpTo(quoteMark):
757+
return self.data[oldPosition:self.data.position]
758+
else:
759+
return None
758760
else:
759761
#Unquoted value
760762
oldPosition = self.data.position

0 commit comments

Comments
 (0)