Skip to content

Commit ce43212

Browse files
committed
Reworked the tokeniser so that it only ever un-consumes (ungets) a single character at a time. Simplified the line/column position counters. (Saves about 5% of parsing time.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401241
1 parent b7c7de7 commit ce43212

File tree

3 files changed

+268
-192
lines changed

3 files changed

+268
-192
lines changed

src/html5lib/inputstream.py

Lines changed: 85 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7272
self.chunk = u""
7373
self.chunkSize = 0
7474
self.chunkOffset = 0
75-
self.ungetBuffer = [] # reversed list of chars from unget()
76-
self.readChars = []
7775
self.errors = []
78-
79-
self.lineLengths = []
76+
# Single-character buffer to handle 'unget'
77+
self.ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78+
79+
# Remember the current position in the document
80+
self.positionLine = 1
81+
self.positionCol = 0
82+
# Remember the length of the last line, so unget("\n") can restore
83+
# positionCol. (Only one character can be ungot at once, so we only
84+
# need to remember the single last line.)
85+
self.lastLineLength = None
8086

8187
#Flag to indicate we may have a CR LF broken across a data chunk
8288
self._lastChunkEndsWithCR = False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219225
encoding = parser.getEncoding()
220226
return encoding
221227

222-
def updatePosition(self):
223-
#Remove EOF from readChars, if present
224-
if not self.readChars:
225-
return
226-
if self.readChars and self.readChars[-1] == EOF:
227-
#There may be more than one EOF in readChars so we cannot assume
228-
#readChars.index(EOF) == -1
229-
self.readChars = self.readChars[:self.readChars.index(EOF)]
230-
readChars = "".join(self.readChars)
231-
lines = readChars.split("\n")
232-
if self.lineLengths:
233-
self.lineLengths[-1] += len(lines[0])
228+
def updatePosition(self, chars):
229+
# Update the position attributes to correspond to some sequence of
230+
# read characters
231+
232+
# Find the last newline character
233+
idx = chars.rfind(u"\n")
234+
if idx == -1:
235+
# No newlines in chars
236+
self.positionCol += len(chars)
234237
else:
235-
self.lineLengths.append(len(lines[0]))
236-
for line in lines[1:]:
237-
self.lineLengths.append(len(line))
238-
self.readChars = []
239-
#print self.lineLengths
238+
# Find the last-but-one newline character
239+
idx2 = chars.rfind(u"\n", 0, idx)
240+
if idx2 == -1:
241+
# Only one newline in chars
242+
self.positionLine += 1
243+
self.lastLineLength = self.positionCol + idx
244+
self.positionCol = len(chars) - (idx + 1)
245+
else:
246+
# At least two newlines in chars
247+
newlines = chars.count(u"\n")
248+
self.positionLine += newlines
249+
self.lastLineLength = idx - (idx2 + 1)
250+
self.positionCol = len(chars) - (idx + 1)
240251

241252
def position(self):
242253
"""Returns (line, col) of the current position in the stream."""
243-
self.updatePosition()
244-
if self.lineLengths:
245-
line, col = len(self.lineLengths), self.lineLengths[-1]
246-
else:
247-
line, col = 1,0
248-
return (line, col)
254+
return (self.positionLine, self.positionCol)
249255

250256
def char(self):
251257
""" Read one character from the stream or queue if available. Return
252258
EOF when EOF is reached.
253259
"""
254-
if self.ungetBuffer:
255-
char = self.ungetBuffer.pop()
256-
self.readChars.append(char)
257-
return char
258-
259-
if self.chunkOffset >= self.chunkSize:
260-
if not self.readChunk():
261-
return EOF
262-
263-
char = self.chunk[self.chunkOffset]
264-
self.chunkOffset += 1
260+
char = self.ungetChar
261+
if char != u"":
262+
# Use the ungot character, and reset the buffer
263+
self.ungetChar = u""
264+
else:
265+
# Read a new chunk from the input stream if necessary
266+
if self.chunkOffset >= self.chunkSize:
267+
if not self.readChunk():
268+
return EOF
269+
270+
char = self.chunk[self.chunkOffset]
271+
self.chunkOffset += 1
272+
273+
# Update the position attributes
274+
if char == u"\n":
275+
self.lastLineLength = self.positionCol
276+
self.positionCol = 0
277+
self.positionLine += 1
278+
elif char is not EOF:
279+
self.positionCol += 1
265280

266-
self.readChars.append(char)
267281
return char
268282

269283
def readChunk(self, chunkSize=_defaultChunkSize):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282296

283297
data = data.replace(u"\u0000", u"\ufffd")
284298
#Check for CR LF broken across chunks
285-
if (self._lastChunkEndsWithCR and data[0] == "\n"):
299+
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
286300
data = data[1:]
287301
# Stop if the chunk is now empty
288302
if not data:
289303
return False
290-
self._lastChunkEndsWithCR = data[-1] == "\r"
291-
data = data.replace("\r\n", "\n")
292-
data = data.replace("\r", "\n")
304+
self._lastChunkEndsWithCR = data[-1] == u"\r"
305+
data = data.replace(u"\r\n", u"\n")
306+
data = data.replace(u"\r", u"\n")
293307

294-
data = unicode(data)
295308
self.chunk = data
296309
self.chunkSize = len(data)
297310

298-
self.updatePosition()
299311
return True
300312

301313
def charsUntil(self, characters, opposite = False):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307319

308320
rv = []
309321

310-
# The unget buffer is typically small and rarely used, so
311-
# just check each character individually
312-
while self.ungetBuffer:
313-
if self.ungetBuffer[-1] == EOF or (self.ungetBuffer[-1] in characters) != opposite:
314-
r = u"".join(rv)
315-
self.readChars.extend(list(r))
316-
return r
322+
# Check the ungot character, if any.
323+
# (Since it's only a single character, don't use the regex here)
324+
char = self.ungetChar
325+
if char != u"":
326+
if char is EOF or (char in characters) != opposite:
327+
return u""
317328
else:
318-
rv.append(self.ungetBuffer.pop())
329+
rv.append(char)
330+
self.ungetChar = u""
319331

320332
# Use a cache of regexps to find the required characters
321333
try:
322334
chars = charsUntilRegEx[(characters, opposite)]
323335
except KeyError:
324336
for c in characters: assert(ord(c) < 128)
325-
regex = u"".join(["\\x%02x" % ord(c) for c in characters])
337+
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
326338
if not opposite:
327339
regex = u"^%s" % regex
328340
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]*" % regex)
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343355
break
344356

345357
r = u"".join(rv)
346-
self.readChars.extend(list(r))
358+
self.updatePosition(r)
347359
return r
348360

349-
def unget(self, chars):
350-
self.updatePosition()
351-
if chars:
352-
l = list(chars)
353-
l.reverse()
354-
self.ungetBuffer.extend(l)
355-
#Alter the current line, col position
356-
for c in chars[::-1]:
357-
if c is None:
358-
continue
359-
elif c == '\n':
360-
assert self.lineLengths[-1] == 0
361-
self.lineLengths.pop()
362-
else:
363-
self.lineLengths[-1] -= 1
361+
def unget(self, char):
362+
# Only one character is allowed to be ungotten at once - it must
363+
# be consumed again before any further call to unget
364+
assert self.ungetChar == u""
365+
366+
self.ungetChar = char
367+
368+
# Update the position attributes
369+
if char is None:
370+
pass
371+
elif char == u"\n":
372+
assert self.positionLine >= 1
373+
assert self.lastLineLength is not None
374+
self.positionLine -= 1
375+
self.positionCol = self.lastLineLength
376+
self.lastLineLength = None
377+
else:
378+
self.positionCol -= 1
364379

365380
class EncodingBytes(str):
366381
"""String-like object with an associated position and various extra methods

0 commit comments

Comments
 (0)