Skip to content

Commit 0c6d178

Browse files
committed
Don't update the stream position each time a character is read
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401015
1 parent 13f7c37 commit 0c6d178

File tree

2 files changed

+48
-52
lines changed

2 files changed

+48
-52
lines changed

src/html5lib/inputstream.py

Lines changed: 36 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
6161
'replace')
6262

6363
self.queue = deque([])
64+
self.readChars = []
6465
self.errors = []
6566

66-
self.line = self.col = 0
6767
self.lineLengths = []
6868

6969
#Flag to indicate we may have a CR LF broken across a data chunk
@@ -202,10 +202,33 @@ def detectEncodingMeta(self):
202202
self.seek(buffer, 0)
203203
return parser.getEncoding()
204204

205+
def updatePosition(self):
206+
#Remove EOF from readChars, if present
207+
if not self.readChars:
208+
return
209+
if self.readChars and self.readChars[-1] == EOF:
210+
#There may be more than one EOF in readChars so we cannot assume
211+
#that the first EOF is the last element, i.e. readChars.index(EOF) == len(readChars) - 1
212+
self.readChars = self.readChars[:self.readChars.index(EOF)]
213+
readChars = "".join(self.readChars)
214+
lines = readChars.split("\n")
215+
if self.lineLengths:
216+
self.lineLengths[-1] += len(lines[0])
217+
else:
218+
self.lineLengths.append(len(lines[0]))
219+
for line in lines[1:]:
220+
self.lineLengths.append(len(line))
221+
self.readChars = []
222+
#print self.lineLengths
223+
205224
def position(self):
206225
"""Returns (line, col) of the current position in the stream."""
207-
line, col = self.line, self.col
208-
return (line + 1, col)
226+
self.updatePosition()
227+
if self.lineLengths:
228+
line, col = len(self.lineLengths), self.lineLengths[-1]
229+
else:
230+
line, col = 1,0
231+
return (line, col)
209232

210233
def char(self):
211234
""" Read one character from the stream or queue if available. Return
@@ -219,13 +242,7 @@ def char(self):
219242

220243
char = self.queue.popleft()
221244

222-
# update position in stream
223-
if char == '\n':
224-
self.lineLengths.append(self.col)
225-
self.line += 1
226-
self.col = 0
227-
else:
228-
self.col += 1
245+
self.readChars.append(char)
229246
return char
230247

231248
def readChunk(self, chunkSize=10240):
@@ -246,6 +263,8 @@ def readChunk(self, chunkSize=10240):
246263
data = unicode(data)
247264
self.queue.extend([char for char in data])
248265

266+
self.updatePosition()
267+
249268
def charsUntil(self, characters, opposite = False):
250269
""" Returns a string of characters from the stream up to but not
251270
including any character in characters or EOF. characters can be
@@ -273,60 +292,27 @@ def charsUntil(self, characters, opposite = False):
273292
#If the queue doesn't grow we have reached EOF
274293
if i == len(self.queue) or self.queue[i] is EOF:
275294
break
276-
#XXX- wallpaper over bug in calculation below
277-
#Otherwise change the stream position
278-
if self.queue[i] == '\n':
279-
self.lineLengths.append(self.col)
280-
self.line += 1
281-
self.col = 0
282-
else:
283-
self.col += 1
284295

285-
rv = u"".join([ self.queue.popleft() for c in range(i) ])
286-
287-
#Calculate where we now are in the stream
288-
#One possible optimisation would be to store all read characters and
289-
#Calculate this on an as-needed basis (perhaps flushing the read data
290-
#every time we read a new chunk) rather than once per call here and
291-
#in .char()
292-
293-
#XXX Temporarily disable this because there is a bug
296+
rv = [self.queue.popleft() for c in range(i)]
294297

295-
#lines = rv.split("\n")
296-
#
297-
#if lines:
298-
# #Add number of lines passed onto positon
299-
# oldCol = self.col
300-
# self.line += len(lines)-1
301-
# if len(lines) > 1:
302-
# self.col = len(lines[-1])
303-
# else:
304-
# self.col += len(lines[0])
305-
#
306-
# if self.lineLengths and oldCol > 0:
307-
# self.lineLengths[-1] += len(lines[0])
308-
# lines = lines[1:-1]
309-
# else:
310-
# lines = lines[:-1]
311-
#
312-
# for line in lines:
313-
# self.lineLengths.append(len(line))
314-
#
298+
self.readChars.extend(rv)
315299

300+
rv = u"".join(rv)
316301
return rv
317302

318303
def unget(self, chars):
304+
self.updatePosition()
319305
if chars:
320306
l = list(chars)
321307
l.reverse()
322308
self.queue.extendleft(l)
323309
#Alter the current line, col position
324310
for c in chars[::-1]:
325311
if c == '\n':
326-
self.line -= 1
327-
self.col = self.lineLengths[self.line]
312+
assert self.lineLengths[-1] == 0
313+
self.lineLengths.pop()
328314
else:
329-
self.col -= 1
315+
self.lineLengths[-1] -= 1
330316

331317
class EncodingBytes(str):
332318
"""String-like object with an associated position and various extra methods

tests/test_stream.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,23 @@ def test_newlines(self):
4141
self.assertEquals(stream.position(), (1, 0))
4242
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
4343
self.assertEquals(stream.position(), (3,0))
44-
self.assertEquals(stream.lineLengths, [1,2])
4544
self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
4645
self.assertEquals(stream.position(), (4,4))
47-
self.assertEquals(stream.lineLengths, [1,2,3])
4846
self.assertEquals(stream.charsUntil('e'),u"x")
4947
self.assertEquals(stream.position(), (4,5))
5048

49+
def test_position(self):
50+
stream = HTMLInputStream(codecs.BOM_UTF8 + "a\nbb\nccc\nddd")
51+
self.assertEquals(stream.position(), (1, 0))
52+
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
53+
self.assertEquals(stream.position(), (3, 0))
54+
stream.unget("a\nbb\n")
55+
self.assertEquals(stream.position(), (1, 0))
56+
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
57+
self.assertEquals(stream.position(), (3, 0))
58+
self.assertEquals(stream.charsUntil('e'),u"ccc\nddd")
59+
self.assertEquals(stream.position(), (4, 3))
60+
5161
def buildTestSuite():
5262
return unittest.defaultTestLoader.loadTestsFromName(__name__)
5363

0 commit comments

Comments
 (0)