Skip to content

Commit afe181d

Browse files
committed
Check for invalid codepoints in input stream
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401141
1 parent bd4ad51 commit afe181d

File tree

3 files changed

+20
-11
lines changed

3 files changed

+20
-11
lines changed

src/html5lib/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
E = {
1414
"null-character":
1515
_(u"Null character in input stream, replaced with U+FFFD."),
16+
"invalid-character":
17+
_(u"Invalid codepoint in stream."),
1618
"incorrectly-placed-solidus":
1719
_(u"Solidus (/) incorrectly placed in tag."),
1820
"incorrect-cr-newline-entity":
@@ -1052,4 +1054,4 @@
10521054
))
10531055

10541056
class DataLossWarning(UserWarning):
1055-
pass
1057+
pass

src/html5lib/inputstream.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
asciiLettersBytes = [str(item) for item in asciiLetters]
1212
asciiUppercaseBytes = [str(item) for item in asciiUppercase]
1313

14+
invalid_unicode_re = re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
15+
1416
try:
1517
from collections import deque
1618
except ImportError:
@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
2830
"""Initialises the HTMLInputStream.
2931
3032
HTMLInputStream(source, [encoding]) -> Normalized stream from source
31-
for use by the HTML5Lib.
33+
for use by html5lib.
3234
3335
source can be either a file-object, local filename or a string.
3436
@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5961
self.defaultEncoding = "windows-1252"
6062

6163
#Detect encoding iff no explicit "transport level" encoding is supplied
62-
if self.charEncoding[0] is None or not isValidEncoding(self.charEncoding[0]):
64+
if (self.charEncoding[0] is None or
65+
not isValidEncoding(self.charEncoding[0])):
6366
self.charEncoding = self.detectEncoding(parseMeta, chardet)
6467

6568
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
@@ -87,7 +90,7 @@ def openStream(self, source):
8790
# Otherwise treat source as a string and convert to a file object
8891
if isinstance(source, unicode):
8992
source = source.encode('utf-8')
90-
self.charEncoding = "utf-8"
93+
self.charEncoding = ("utf-8", "certain")
9194
import cStringIO
9295
stream = cStringIO.StringIO(str(source))
9396
return stream
@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):
262265
#Replace null characters
263266
for i in xrange(data.count(u"\u0000")):
264267
self.errors.append("null-character")
268+
for i in xrange(len(invalid_unicode_re.findall(data))):
269+
self.errors.append("invalid-character")
270+
265271
data = data.replace(u"\u0000", u"\ufffd")
266272
#Check for CR LF broken across chunks
267273
if (self._lastChunkEndsWithCR and data[0] == "\n"):
@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):
271277
data = data.replace("\r", "\n")
272278

273279
data = unicode(data)
274-
self.queue.extend([char for char in data])
280+
self.queue.extend(list(data))
275281

276282
self.updatePosition()
277283

src/html5lib/tokenizer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
# Import from the sets module for python 2.3
55
from sets import Set as set
66
from sets import ImmutableSet as frozenset
7-
7+
try:
8+
from collections import deque
9+
except ImportError:
10+
from utils import deque
11+
812
from constants import contentModelFlags, spaceCharacters
913
from constants import entitiesWindows1252, entities
1014
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
@@ -83,24 +87,21 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
8387
# The current token being created
8488
self.currentToken = None
8589

86-
# Tokens to be processed.
87-
self.tokenQueue = []
88-
8990
def __iter__(self):
9091
""" This is where the magic happens.
9192
9293
We do our usually processing through the states and when we have a token
9394
to return we yield the token which pauses processing until the next token
9495
is requested.
9596
"""
96-
self.tokenQueue = []
97+
self.tokenQueue = deque([])
9798
# Start processing. When EOF is reached self.state will return False
9899
# instead of True and the loop will terminate.
99100
while self.state():
100101
while self.stream.errors:
101102
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
102103
while self.tokenQueue:
103-
yield self.tokenQueue.pop(0)
104+
yield self.tokenQueue.popleft()
104105

105106
# Below are various helper functions the tokenizer states use worked out.
106107
def processSolidusInTag(self):

0 commit comments

Comments
 (0)