Skip to content

Commit afe181d

Browse files
committed
Check for invalid codepoints in input stream
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401141
1 parent bd4ad51 commit afe181d

File tree

3 files changed

+20
-11
lines changed

3 files changed

+20
-11
lines changed

src/html5lib/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
E = {
1414
"null-character":
1515
_(u"Null character in input stream, replaced with U+FFFD."),
16+
"invalid-character":
17+
_(u"Invalid codepoint in stream."),
1618
"incorrectly-placed-solidus":
1719
_(u"Solidus (/) incorrectly placed in tag."),
1820
"incorrect-cr-newline-entity":
@@ -1052,4 +1054,4 @@
10521054
))
10531055

10541056
class DataLossWarning(UserWarning):
1055-
pass
1057+
pass

src/html5lib/inputstream.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
asciiLettersBytes = [str(item) for item in asciiLetters]
1212
asciiUppercaseBytes = [str(item) for item in asciiUppercase]
1313

14+
invalid_unicode_re = re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE|\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
15+
1416
try:
1517
from collections import deque
1618
except ImportError:
@@ -28,7 +30,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
2830
"""Initialises the HTMLInputStream.
2931
3032
HTMLInputStream(source, [encoding]) -> Normalized stream from source
31-
for use by the HTML5Lib.
33+
for use by html5lib.
3234
3335
source can be either a file-object, local filename or a string.
3436
@@ -59,7 +61,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5961
self.defaultEncoding = "windows-1252"
6062

6163
#Detect encoding iff no explicit "transport level" encoding is supplied
62-
if self.charEncoding[0] is None or not isValidEncoding(self.charEncoding[0]):
64+
if (self.charEncoding[0] is None or
65+
not isValidEncoding(self.charEncoding[0])):
6366
self.charEncoding = self.detectEncoding(parseMeta, chardet)
6467

6568
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
@@ -87,7 +90,7 @@ def openStream(self, source):
8790
# Otherwise treat source as a string and convert to a file object
8891
if isinstance(source, unicode):
8992
source = source.encode('utf-8')
90-
self.charEncoding = "utf-8"
93+
self.charEncoding = ("utf-8", "certain")
9194
import cStringIO
9295
stream = cStringIO.StringIO(str(source))
9396
return stream
@@ -262,6 +265,9 @@ def readChunk(self, chunkSize=10240):
262265
#Replace null characters
263266
for i in xrange(data.count(u"\u0000")):
264267
self.errors.append("null-character")
268+
for i in xrange(len(invalid_unicode_re.findall(data))):
269+
self.errors.append("invalid-character")
270+
265271
data = data.replace(u"\u0000", u"\ufffd")
266272
#Check for CR LF broken across chunks
267273
if (self._lastChunkEndsWithCR and data[0] == "\n"):
@@ -271,7 +277,7 @@ def readChunk(self, chunkSize=10240):
271277
data = data.replace("\r", "\n")
272278

273279
data = unicode(data)
274-
self.queue.extend([char for char in data])
280+
self.queue.extend(list(data))
275281

276282
self.updatePosition()
277283

src/html5lib/tokenizer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
# Import from the sets module for python 2.3
55
from sets import Set as set
66
from sets import ImmutableSet as frozenset
7-
7+
try:
8+
from collections import deque
9+
except ImportError:
10+
from utils import deque
11+
812
from constants import contentModelFlags, spaceCharacters
913
from constants import entitiesWindows1252, entities
1014
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
@@ -83,24 +87,21 @@ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
8387
# The current token being created
8488
self.currentToken = None
8589

86-
# Tokens to be processed.
87-
self.tokenQueue = []
88-
8990
def __iter__(self):
9091
""" This is where the magic happens.
9192
9293
We do our usually processing through the states and when we have a token
9394
to return we yield the token which pauses processing until the next token
9495
is requested.
9596
"""
96-
self.tokenQueue = []
97+
self.tokenQueue = deque([])
9798
# Start processing. When EOF is reached self.state will return False
9899
# instead of True and the loop will terminate.
99100
while self.state():
100101
while self.stream.errors:
101102
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
102103
while self.tokenQueue:
103-
yield self.tokenQueue.pop(0)
104+
yield self.tokenQueue.popleft()
104105

105106
# Below are various helper functions the tokenizer states use worked out.
106107
def processSolidusInTag(self):

0 commit comments

Comments
 (0)