Skip to content

Commit ede74c6

Browse files
committed
Calculate new position at end of charsUntil
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40921
1 parent b720453 commit ede74c6

File tree

2 files changed

+31
-9
lines changed

2 files changed

+31
-9
lines changed

src/html5lib/inputstream.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ def charsUntil(self, characters, opposite = False):
255255
#optimizing
256256
#Possible improvements:
257257
# - use regexp to find characters that match the required character set
258-
# - compute line positions in a single pass at the end
258+
# (with regexp cache since we do the same operations many many times)
259259
# - improve EOF handling for fewer if statements
260260

261261
if not self.queue:
@@ -266,13 +266,6 @@ def charsUntil(self, characters, opposite = False):
266266

267267
i = 0
268268
while (self.queue[i] in characters) == opposite:
269-
#Working out positions like this really sucks
270-
if self.queue[i] == '\n':
271-
self.lineLengths.append(self.col)
272-
self.line += 1
273-
self.col = 0
274-
else:
275-
self.col += 1
276269
i += 1
277270
if i == len(self.queue):
278271
self.readChunk()
@@ -281,6 +274,32 @@ def charsUntil(self, characters, opposite = False):
281274
break
282275

283276
rv = u"".join(self.queue[:i])
277+
278+
#Calculate where we now are in the stream
279+
#One possible optimisation would be to store all read characters and
280+
#calculate this on an as-needed basis (perhaps flushing the read data
281+
#every time we read a new chunk) rather than once per call here and
282+
#in .char()
283+
lines = rv.split("\n")
284+
285+
if lines:
286+
#Add number of lines passed onto positon
287+
oldCol = self.col
288+
self.line += len(lines)-1
289+
if len(lines) > 1:
290+
self.col = len(lines[-1])
291+
else:
292+
self.col += len(lines[0])
293+
294+
if self.lineLengths and oldCol > 0:
295+
self.lineLengths[-1] += len(lines[0])
296+
lines = lines[1:-1]
297+
else:
298+
lines = lines[:-1]
299+
300+
for line in lines:
301+
self.lineLengths.append(len(line))
302+
284303
self.queue = self.queue[i:]
285304

286305
return rv

tests/test_stream.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,16 @@ def test_utf_16(self):
3737
self.assertEquals(len(stream.charsUntil(' ',True)),1025)
3838

3939
def test_newlines(self):
40-
stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rdddd")
40+
stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe")
4141
self.assertEquals(stream.position(), (1, 0))
4242
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
4343
self.assertEquals(stream.position(), (3,0))
44+
self.assertEquals(stream.lineLengths, [1,2])
4445
self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
4546
self.assertEquals(stream.position(), (4,4))
4647
self.assertEquals(stream.lineLengths, [1,2,3])
48+
self.assertEquals(stream.charsUntil('e'),u"x")
49+
self.assertEquals(stream.position(), (4,5))
4750

4851
def buildTestSuite():
4952
return unittest.defaultTestLoader.loadTestsFromName(__name__)

0 commit comments

Comments
 (0)