Skip to content

Commit 447b711

Browse files
committed
Don't crash and burn when non-ASCII characters are found in the pre-parse
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401060
1 parent b24ea63 commit 447b711

File tree

2 files changed

+18
-13
lines changed

2 files changed

+18
-13
lines changed

src/html5lib/inputstream.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
from constants import encodings
77
from utils import MethodDispatcher
88

9+
#Non-unicode versions of constants for use in the pre-parser
10+
spaceCharactersBytes = [str(item) for item in spaceCharacters]
11+
asciiLettersBytes = [str(item) for item in asciiLetters]
12+
asciiUppercaseBytes = [str(item) for item in asciiUppercase]
13+
914
try:
1015
from collections import deque
1116
except ImportError:
@@ -357,7 +362,7 @@ def getCurrentByte(self):
357362

358363
currentByte = property(getCurrentByte)
359364

360-
def skip(self, chars=spaceCharacters):
365+
def skip(self, chars=spaceCharactersBytes):
361366
"""Skip past a list of characters"""
362367
while self.currentByte in chars:
363368
self.position += 1
@@ -432,7 +437,7 @@ def handleComment(self):
432437
return self.data.jumpTo("-->")
433438

434439
def handleMeta(self):
435-
if self.data.currentByte not in spaceCharacters:
440+
if self.data.currentByte not in spaceCharactersBytes:
436441
#if we have <meta not followed by a space, just keep going
437442
return True
438443
#We have a valid meta element we want to search for attributes
@@ -462,7 +467,7 @@ def handlePossibleEndTag(self):
462467
return self.handlePossibleTag(True)
463468

464469
def handlePossibleTag(self, endTag):
465-
if self.data.currentByte not in asciiLetters:
470+
if self.data.currentByte not in asciiLettersBytes:
466471
#If the next byte is not an ascii letter either ignore this
467472
#fragment (possible start tag case) or treat it according to
468473
#handleOther
@@ -471,7 +476,7 @@ def handlePossibleTag(self, endTag):
471476
self.handleOther()
472477
return True
473478

474-
self.data.findNext(list(spaceCharacters) + ["<", ">"])
479+
self.data.findNext(list(spaceCharactersBytes) + ["<", ">"])
475480
if self.data.currentByte == "<":
476481
#return to the first step in the overall "two step" algorithm
477482
#reprocessing the < byte
@@ -489,7 +494,7 @@ def handleOther(self):
489494
def getAttribute(self):
490495
"""Return a name,value pair for the next attribute in the stream,
491496
if one is found, or None"""
492-
self.data.skip(list(spaceCharacters)+["/"])
497+
self.data.skip(list(spaceCharactersBytes)+["/"])
493498
if self.data.currentByte == "<":
494499
self.data.position -= 1
495500
return None
@@ -502,12 +507,12 @@ def getAttribute(self):
502507
while True:
503508
if self.data.currentByte == "=" and attrName:
504509
break
505-
elif self.data.currentByte in spaceCharacters:
510+
elif self.data.currentByte in spaceCharactersBytes:
506511
spaceFound=True
507512
break
508513
elif self.data.currentByte in ("/", "<", ">"):
509514
return "".join(attrName), ""
510-
elif self.data.currentByte in asciiUppercase:
515+
elif self.data.currentByte in asciiUppercaseBytes:
511516
attrName.extend(self.data.currentByte.lower())
512517
else:
513518
attrName.extend(self.data.currentByte)
@@ -536,23 +541,23 @@ def getAttribute(self):
536541
self.data.position += 1
537542
return "".join(attrName), "".join(attrValue)
538543
#11.4
539-
elif self.data.currentByte in asciiUppercase:
544+
elif self.data.currentByte in asciiUppercaseBytes:
540545
attrValue.extend(self.data.currentByte.lower())
541546
#11.5
542547
else:
543548
attrValue.extend(self.data.currentByte)
544549
elif self.data.currentByte in (">", "<"):
545550
return "".join(attrName), ""
546-
elif self.data.currentByte in asciiUppercase:
551+
elif self.data.currentByte in asciiUppercaseBytes:
547552
attrValue.extend(self.data.currentByte.lower())
548553
else:
549554
attrValue.extend(self.data.currentByte)
550555
while True:
551556
self.data.position +=1
552557
if self.data.currentByte in (
553-
list(spaceCharacters) + [">", "<"]):
558+
list(spaceCharactersBytes) + [">", "<"]):
554559
return "".join(attrName), "".join(attrValue)
555-
elif self.data.currentByte in asciiUppercase:
560+
elif self.data.currentByte in asciiUppercaseBytes:
556561
attrValue.extend(self.data.currentByte.lower())
557562
else:
558563
attrValue.extend(self.data.currentByte)
@@ -588,7 +593,7 @@ def parse(self):
588593
#Unquoted value
589594
oldPosition = self.data.position
590595
try:
591-
self.data.findNext(spaceCharacters)
596+
self.data.findNext(spaceCharactersBytes)
592597
return self.data[oldPosition:self.data.position]
593598
except StopIteration:
594599
#Return the whole remaining value

src/html5lib/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class HTMLTokenizer(object):
3131
# XXX need to fix documentation
3232

3333
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
34-
lowercaseElementName=True, lowercaseAttrName=True,):
34+
lowercaseElementName=True, lowercaseAttrName=True):
3535
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
3636

3737
#Perform case conversions?

0 commit comments

Comments
 (0)