groklearning
diff --git a/‎parse.py
Lines changed: 3 additions & 2 deletions b/‎parse.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/html5lib/constants.py
Lines changed: 17 additions & 0 deletions b/‎src/html5lib/constants.py
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/html5lib/html5parser.py
Lines changed: 612 additions & 598 deletions b/‎src/html5lib/html5parser.py
Lines changed: 612 additions & 598 deletions
diff --git a/‎src/html5lib/inputstream.py
Lines changed: 102 additions & 58 deletions b/‎src/html5lib/inputstream.py
Lines changed: 102 additions & 58 deletions
@@ -12,7 +12,8 @@
 #RELEASE remove
 sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
 #END RELEASE
-from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
+from html5lib import html5parser, liberalxmlparser, sanitizer
+from html5lib.tokenizer import HTMLTokenizer
 from html5lib import treebuilders, serializer, treewalkers
 from html5lib import constants
 
@@ -80,7 +81,7 @@ def parse():
         t1 = time.time()
         printOutput(p, document, opts)
         t2 = time.time()
-        print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
+        sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
     else:
         document = parseMethod(f, encoding=encoding)
         printOutput(p, document, opts)
 
@@ -72,6 +72,10 @@
        _(u"Unexpected end of file in attribute value (')."),
     "eof-in-attribute-value-no-quotes":
        _(u"Unexpected end of file in attribute value."),
+    "unexpected-EOF-after-solidus-in-tag":
+        _(u"Unexpected end of file in tag. Expected >"),
+    "unexpected-character-after-soldius-in-tag":
+        _(u"Unexpected character after / in tag. Expected >"),
     "expected-dashes-or-doctype":
        _(u"Expected '--' or 'DOCTYPE'. Not found."),
     "incorrect-comment":
@@ -1098,5 +1102,18 @@
     "ParseError":7
 }
 
+namespaces = {
+    "html":"http://www.w3.org/1999/xhtml",
+    "mathml":"http://www.w3.org/1998/Math/MathML",
+    "svg":"http://www.w3.org/2000/svg",
+    "xlink":"http://www.w3.org/1999/xlink",
+    "xml":"http://www.w3.org/XML/1998/namespace",
+    "xmlns":"http://www.w3.org/2000/xmlns/"
+}
+
+
 class DataLossWarning(UserWarning):
     pass
+
+class ReparseException(Exception):
+    pass
@@ -3,7 +3,7 @@
 import types
 
 from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings
+from constants import encodings, ReparseException
 
 #Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = [str(item) for item in spaceCharacters]
@@ -16,6 +16,82 @@
 
 # Cache for charsUntil()
 charsUntilRegEx = {}
+        
+class BufferedStream:
+    """Buffering for streams that do not have buffering of their own
+
+    The buffer is implemented as a list of chunks on the assumption that 
+    joining many strings will be slow since it is O(n**2)
+    """
+    
+    def __init__(self, stream):
+        self.stream = stream
+        self.buffer = []
+        self.position = [-1,0] #chunk number, offset
+
+    def tell(self):
+        pos = 0
+        for chunk in self.buffer[:self.position[0]]:
+            pos += len(chunk)
+        pos += self.position[1]
+        return pos
+
+    def seek(self, pos):
+        assert pos < self._bufferedBytes()
+        offset = pos
+        i = 0
+        while len(self.buffer[i]) < offset:
+            offset -= pos
+            i += 1
+        self.position = [i, offset]
+
+    def read(self, bytes):
+        if not self.buffer:
+            return self._readStream(bytes)
+        elif (self.position[0] == len(self.buffer) and
+              self.position[1] == len(self.buffer[-1])):
+            return self._readStream(bytes)
+        else:
+            return self._readFromBuffer(bytes)
+    
+    def _bufferedBytes(self):
+        return sum([len(item) for item in self.buffer])
+
+    def _readStream(self, bytes):
+        data = self.stream.read(bytes)
+        self.buffer.append(data)
+        self.position[0] += 1
+        self.position[1] = len(data)
+        return data
+
+    def _readFromBuffer(self, bytes):
+        remainingBytes = bytes
+        rv = []
+        bufferIndex = self.position[0]
+        bufferOffset = self.position[1]
+        while bufferIndex < len(self.buffer) and remainingBytes != 0:
+            assert remainingBytes > 0
+            bufferedData = self.buffer[bufferIndex]
+            
+            if remainingBytes <= len(bufferedData) - bufferOffset:
+                bytesToRead = remainingBytes
+                self.position = [bufferIndex, bufferOffset + bytesToRead]
+            else:
+                bytesToRead = len(bufferedData) - bufferOffset
+                self.position = [bufferIndex, len(bufferedData)]
+                bufferIndex += 1
+            data = rv.append(bufferedData[bufferOffset: 
+                                          bufferOffset + bytesToRead])
+            remainingBytes -= bytesToRead
+
+            bufferOffset = 0
+
+        if remainingBytes:
+            rv.append(self._readStream(remainingBytes))
+        
+        return "".join(rv)
+        
+
 
 class HTMLInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         if (self.charEncoding[0] is None):
             self.charEncoding = self.detectEncoding(parseMeta, chardet)
 
+        self.reset()
+
+    def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
 
@@ -100,6 +179,10 @@ def openStream(self, source):
                 self.charEncoding = ("utf-8", "certain")
             import cStringIO
             stream = cStringIO.StringIO(str(source))
+
+        if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
+            stream = BufferedStream(stream)
+
         return stream
 
     def detectEncoding(self, parseMeta=True, chardet=True):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
                     detector.feed(buffer)
                 detector.close()
                 encoding = detector.result['encoding']
-                self.seek("".join(buffers), 0)
+                self.rawStream.seek(0)
             except ImportError:
                 pass
         # If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
 
     def changeEncoding(self, newEncoding):
         newEncoding = codecName(newEncoding)
-        if newEncoding == "utf16":
-            newEncoding = "utf8"
-
+        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
+            newEncoding = "utf-8"
         if newEncoding is None:
             return
         elif newEncoding == self.charEncoding[0]:
-            self.charEncoding = (self.charEncoding[0] and "certian")
+            self.charEncoding = (self.charEncoding[0], "certian")
         else:
-            raise NotImplementedError, "Cannot change character encoding mid stream"
-
+            self.rawStream.seek(0)
+            self.reset()
+            self.charEncoding = (newEncoding, "certian")
+            raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
+            
     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
         an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
 
         # Set the read position past the BOM if one was found, otherwise
         # set it to the start of the stream
-        self.seek(string, encoding and seek or 0)
+        self.rawStream.seek(encoding and seek or 0)
 
         return encoding
 
-    def seek(self, buffer, n):
-        """Unget buffer[n:]"""
-        if hasattr(self.rawStream, 'unget'):
-            self.rawStream.unget(buffer[n:])
-            return 
-
-        if hasattr(self.rawStream, 'seek'):
-            try:
-                self.rawStream.seek(n)
-                return
-            except IOError:
-                pass
-
-        class BufferedStream:
-             def __init__(self, data, stream):
-                 self.data = data
-                 self.stream = stream
-             def read(self, chars=-1):
-                 if chars == -1 or chars > len(self.data):
-                     result = self.data
-                     self.data = ''
-                     if chars == -1:
-                         return result + self.stream.read()
-                     else:
-                         return result + self.stream.read(chars-len(result))
-                 elif not self.data:
-                     return self.stream.read(chars)
-                 else:
-                     result = self.data[:chars]
-                     self.data = self.data[chars:]
-                     return result
-             def unget(self, data):
-                 if self.data:
-                     self.data += data
-                 else:
-                     self.data = data
-
-        self.rawStream = BufferedStream(buffer[n:], self.rawStream)
-
     def detectEncodingMeta(self):
         """Report the encoding declared by the meta element
         """
         buffer = self.rawStream.read(self.numBytesMeta)
         parser = EncodingParser(buffer)
-        self.seek(buffer, 0)
+        self.rawStream.seek(0)
         encoding = parser.getEncoding()
+        
+        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
+            encoding = "utf-8"
+
         return encoding
 
     def updatePosition(self, chars):
@@ -485,13 +535,6 @@ def getEncoding(self):
                         break
             if not keepParsing:
                 break
-        if self.encoding is not None:
-            self.encoding = self.encoding.strip()        
-            #Spec violation that complies with hsivonen + mjs
-            if (ascii_punctuation_re.sub("", self.encoding) in
-                ("utf16", "utf16be", "utf16le",
-                 "utf32", "utf32be", "utf32le")):
-                self.encoding = "utf-8"
 
         return self.encoding
 
@@ -666,11 +709,12 @@ def parse(self):
         except StopIteration:
             return None
 
+
 def codecName(encoding):
     """Return the python codec name corresponding to an encoding or None if the
     string doesn't correspond to a valid encoding."""
-    if (encoding is not None and type(encoding) == types.StringType):
+    if (encoding is not None and type(encoding) in types.StringTypes):
         canonicalName = ascii_punctuation_re.sub("", encoding).lower()
-        return encodings.get(canonicalName, None) 
+        return encodings.get(canonicalName, None)
     else:
         return None