Calculate new position at end of charsUntil

jgraham · jgraham · commit ede74c676e79 · 2007-07-27T16:40:55.000Z
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40921
diff --git a/src/html5lib/inputstream.py b/src/html5lib/inputstream.py
@@ -255,7 +255,7 @@ def charsUntil(self, characters, opposite = False):
         #optimizing
         #Possible improvements:
         # - use regexp to find characters that match the required character set
-        # - compute line positions in a single pass at the end
+        #   (with regexp cache since we do the same operations many many times)
         # - improve EOF handling for fewer if statements
 
         if not self.queue:
@@ -266,13 +266,6 @@ def charsUntil(self, characters, opposite = False):
         
         i = 0
         while (self.queue[i] in characters) == opposite:
-            #Working out positions like this really sucks
-            if self.queue[i] == '\n':
-                self.lineLengths.append(self.col)
-                self.line += 1
-                self.col = 0
-            else:
-                self.col += 1
             i += 1
             if i == len(self.queue):
                 self.readChunk()
@@ -281,6 +274,32 @@ def charsUntil(self, characters, opposite = False):
                 break
 
         rv = u"".join(self.queue[:i])
+        
+        #Calculate where we now are in the stream
+        #One possible optimisation would be to store all read characters and
+        #calculate this on an as-needed basis (perhaps flushing the read data
+        #every time we read a new chunk) rather than once per call here and
+        #in .char()
+        lines = rv.split("\n")
+        
+        if lines:
+            #Add number of lines passed onto position
+            oldCol = self.col
+            self.line += len(lines)-1
+            if len(lines) > 1:
+                self.col = len(lines[-1])
+            else:
+                self.col += len(lines[0])
+
+            if self.lineLengths and oldCol > 0:
+                self.lineLengths[-1] += len(lines[0])
+                lines = lines[1:-1]
+            else:
+                lines = lines[:-1]
+        
+            for line in lines:
+                self.lineLengths.append(len(line))
+
         self.queue = self.queue[i:]
         
         return rv
diff --git a/tests/test_stream.py b/tests/test_stream.py
@@ -37,13 +37,16 @@ def test_utf_16(self):
         self.assertEquals(len(stream.charsUntil(' ',True)),1025)
 
     def test_newlines(self):
-        stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rdddd")
+        stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe")
         self.assertEquals(stream.position(), (1, 0))
         self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
         self.assertEquals(stream.position(), (3,0))
+        self.assertEquals(stream.lineLengths, [1,2])
         self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
         self.assertEquals(stream.position(), (4,4))
         self.assertEquals(stream.lineLengths, [1,2,3])
+        self.assertEquals(stream.charsUntil('e'),u"x")
+        self.assertEquals(stream.position(), (4,5))
 
 def buildTestSuite():
     return unittest.defaultTestLoader.loadTestsFromName(__name__)