Don't update the stream position each time a character is read

jgraham · jgraham · commit 0c6d178f90dc · 2007-09-23T22:52:23.000Z
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401015
diff --git a/src/html5lib/inputstream.py b/src/html5lib/inputstream.py
@@ -61,9 +61,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
                                                               'replace')
 
         self.queue = deque([])
+        self.readChars = []
         self.errors = []
 
-        self.line = self.col = 0
         self.lineLengths = []
         
         #Flag to indicate we may have a CR LF broken across a data chunk
@@ -202,10 +202,33 @@ def detectEncodingMeta(self):
         self.seek(buffer, 0)
         return parser.getEncoding()
 
+    def updatePosition(self):
+        """Fold characters read since the last call into lineLengths."""
+        if not self.readChars:
+            return
+        #EOF markers are not text, so strip them before joining
+        if self.readChars[-1] == EOF:
+            #EOF may have been recorded more than once, so cut at its first
+            #occurrence instead of only dropping the last element
+            self.readChars = self.readChars[:self.readChars.index(EOF)]
+        readChars = "".join(self.readChars)
+        lines = readChars.split("\n")
+        if self.lineLengths:
+            self.lineLengths[-1] += len(lines[0])
+        else:
+            self.lineLengths.append(len(lines[0]))
+        for line in lines[1:]:
+            self.lineLengths.append(len(line))
+        self.readChars = []
+
     def position(self):
         """Returns (line, col) of the current position in the stream."""
-        line, col = self.line, self.col
-        return (line + 1, col)
+        self.updatePosition()  #fold pending reads into lineLengths first
+        if self.lineLengths:
+            line, col = len(self.lineLengths), self.lineLengths[-1]
+        else:
+            line, col = 1,0  #nothing recorded yet: report the stream start
+        return (line, col)
 
     def char(self):
         """ Read one character from the stream or queue if available. Return
@@ -219,13 +242,7 @@ def char(self):
         
         char = self.queue.popleft()
         
-        # update position in stream
-        if char == '\n':
-            self.lineLengths.append(self.col)
-            self.line += 1
-            self.col = 0
-        else:
-            self.col += 1
+        self.readChars.append(char)
         return char
 
     def readChunk(self, chunkSize=10240):
@@ -246,6 +263,8 @@ def readChunk(self, chunkSize=10240):
         data = unicode(data)
         self.queue.extend([char for char in data])
 
+        self.updatePosition()
+
     def charsUntil(self, characters, opposite = False):
         """ Returns a string of characters from the stream up to but not
         including any character in characters or EOF. characters can be
@@ -273,60 +292,27 @@ def charsUntil(self, characters, opposite = False):
             #If the queue doesn't grow we have reached EOF
             if i == len(self.queue) or self.queue[i] is EOF:
                 break
-            #XXX- wallpaper over bug in calculation below
-            #Otherwise change the stream position
-            if self.queue[i] == '\n':
-                self.lineLengths.append(self.col)
-                self.line += 1
-                self.col = 0
-            else:
-                self.col += 1
 
-        rv = u"".join([ self.queue.popleft() for c in range(i) ])
-        
-        #Calculate where we now are in the stream
-        #One possible optimisation would be to store all read characters and
-        #Calculate this on an as-needed basis (perhaps flushing the read data
-        #every time we read a new chunk) rather than once per call here and
-        #in .char()
-        
-        #XXX Temporarily disable this because there is a bug
+        rv = [self.queue.popleft() for c in range(i)]
         
-        #lines = rv.split("\n")
-        #
-        #if lines:
-        #    #Add number of lines passed onto positon
-        #    oldCol = self.col
-        #    self.line += len(lines)-1
-        #    if len(lines) > 1:
-        #        self.col = len(lines[-1])
-        #    else:
-        #        self.col += len(lines[0])
-        #
-        #    if self.lineLengths and oldCol > 0:
-        #        self.lineLengths[-1] += len(lines[0])
-        #        lines = lines[1:-1]
-        #    else:
-        #        lines = lines[:-1]
-        #
-        #    for line in lines:
-        #        self.lineLengths.append(len(line))
-        #
+        self.readChars.extend(rv)
         
+        rv = u"".join(rv)
         return rv
 
     def unget(self, chars):
+        self.updatePosition()  #flush pending reads so lineLengths is current
         if chars:
             l = list(chars)
             l.reverse()
             self.queue.extendleft(l)
             #Alter the current line, col position
             for c in chars[::-1]:
                 if c == '\n':
-                    self.line -= 1
-                    self.col = self.lineLengths[self.line]
+                    assert self.lineLengths[-1] == 0  #must be just past the newline
+                    self.lineLengths.pop()
                 else:
-                    self.col -= 1
+                    self.lineLengths[-1] -= 1  #step back one column on this line
 
 class EncodingBytes(str):
     """String-like object with an assosiated position and various extra methods
diff --git a/tests/test_stream.py b/tests/test_stream.py
@@ -41,13 +41,23 @@ def test_newlines(self):
         self.assertEquals(stream.position(), (1, 0))
         self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
         self.assertEquals(stream.position(), (3,0))
-        self.assertEquals(stream.lineLengths, [1,2])
         self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
         self.assertEquals(stream.position(), (4,4))
-        self.assertEquals(stream.lineLengths, [1,2,3])
         self.assertEquals(stream.charsUntil('e'),u"x")
         self.assertEquals(stream.position(), (4,5))
 
+    def test_position(self):
+        stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\nccc\nddd")
+        self.assertEquals(stream.position(), (1, 0))
+        self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
+        self.assertEquals(stream.position(), (3, 0))
+        stream.unget("a\nbb\n")  #pushing everything back must rewind the position
+        self.assertEquals(stream.position(), (1, 0))
+        self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
+        self.assertEquals(stream.position(), (3, 0))
+        self.assertEquals(stream.charsUntil('e'),u"ccc\nddd")
+        self.assertEquals(stream.position(), (4, 3))
+
 def buildTestSuite():
     return unittest.defaultTestLoader.loadTestsFromName(__name__)