Skip to content

Commit 0d0282b

Browse files
committed
Simplified unget code (and improved speed by ~2%)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401246
1 parent 22886b1 commit 0d0282b

File tree

1 file changed

+30
-39
lines changed

1 file changed

+30
-39
lines changed

src/html5lib/inputstream.py

Lines changed: 30 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7373
self.chunkSize = 0
7474
self.chunkOffset = 0
7575
self.errors = []
76-
# Single-character buffer to handle 'unget'
77-
self.ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
7876

7977
# Remember the current position in the document
8078
self.positionLine = 1
@@ -257,18 +255,13 @@ def char(self):
257255
""" Read one character from the stream or queue if available. Return
258256
EOF when EOF is reached.
259257
"""
260-
char = self.ungetChar
261-
if char != u"":
262-
# Use the ungot character, and reset the buffer
263-
self.ungetChar = u""
264-
else:
265-
# Read a new chunk from the input stream if necessary
266-
if self.chunkOffset >= self.chunkSize:
267-
if not self.readChunk():
268-
return EOF
258+
# Read a new chunk from the input stream if necessary
259+
if self.chunkOffset >= self.chunkSize:
260+
if not self.readChunk():
261+
return EOF
269262

270-
char = self.chunk[self.chunkOffset]
271-
self.chunkOffset += 1
263+
char = self.chunk[self.chunkOffset]
264+
self.chunkOffset += 1
272265

273266
# Update the position attributes
274267
if char == u"\n":
@@ -317,18 +310,6 @@ def charsUntil(self, characters, opposite = False):
317310
characters.
318311
"""
319312

320-
rv = []
321-
322-
# Check the ungot character, if any.
323-
# (Since it's only a single character, don't use the regex here)
324-
char = self.ungetChar
325-
if char != u"":
326-
if char is EOF or (char in characters) != opposite:
327-
return u""
328-
else:
329-
rv.append(char)
330-
self.ungetChar = u""
331-
332313
# Use a cache of regexps to find the required characters
333314
try:
334315
chars = charsUntilRegEx[(characters, opposite)]
@@ -339,6 +320,8 @@ def charsUntil(self, characters, opposite = False):
339320
regex = u"^%s" % regex
340321
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
341322

323+
rv = []
324+
342325
while True:
343326
# Find the longest matching prefix
344327
m = chars.match(self.chunk, self.chunkOffset)
@@ -369,21 +352,29 @@ def charsUntil(self, characters, opposite = False):
369352
def unget(self, char):
370353
# Only one character is allowed to be ungotten at once - it must
371354
# be consumed again before any further call to unget
372-
assert self.ungetChar == u""
373-
374-
self.ungetChar = char
375355

376-
# Update the position attributes
377-
if char is None:
378-
pass
379-
elif char == u"\n":
380-
assert self.positionLine >= 1
381-
assert self.lastLineLength is not None
382-
self.positionLine -= 1
383-
self.positionCol = self.lastLineLength
384-
self.lastLineLength = None
385-
else:
386-
self.positionCol -= 1
356+
if char is not None:
357+
if self.chunkOffset == 0:
358+
# unget is called quite rarely, so it's a good idea to do
359+
# more work here if it saves a bit of work in the frequently
360+
# called char and charsUntil.
361+
# So, just prepend the ungotten character onto the current
362+
# chunk:
363+
self.chunk = char + self.chunk
364+
self.chunkSize += 1
365+
else:
366+
self.chunkOffset -= 1
367+
assert self.chunk[self.chunkOffset] == char
368+
369+
# Update the position attributes
370+
if char == u"\n":
371+
assert self.positionLine >= 1
372+
assert self.lastLineLength is not None
373+
self.positionLine -= 1
374+
self.positionCol = self.lastLineLength
375+
self.lastLineLength = None
376+
else:
377+
self.positionCol -= 1
387378

388379
class EncodingBytes(str):
389380
"""String-like object with an associated position and various extra methods

0 commit comments

Comments
 (0)