Skip to content

Commit 56c7e58

Browse files
committed
Attempted optimisation of HTMLInputStream. (Reduces overall parsing time by 15-25% in some typical cases.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401154
1 parent fb146a3 commit 56c7e58

File tree

1 file changed

+63
-45
lines changed

1 file changed

+63
-45
lines changed

src/html5lib/inputstream.py

Lines changed: 63 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,8 @@
1313

1414
invalid_unicode_re = re.compile(u"[\u0001-\u0008]|[\u000E-\u001F]|[\u007F-\u009F]|[\uD800-\uDFFF]|[\uFDD0-\uFDDF]|\uFFFE|\uFFFF|\U0001FFFE|\U0001FFFF|\U0002FFFE|\U0002FFFF|\U0003FFFE|\U0003FFFF|\U0004FFFE|\U0004FFFF|\U0005FFFE|\U0005FFFF|\U0006FFFE|\U0006FFFF|\U0007FFFE|\U0007FFFF|\U0008FFFE|\U0008FFFF|\U0009FFFE|\U0009FFFF|\U000AFFFE|\U000AFFFF|\U000BFFFE\U000BFFFF|\U000CFFFE|\U000CFFFF|\U000DFFFE|\U000DFFFF|\U000EFFFE|\U000EFFFF|\U000FFFFE|\U000FFFFF|\U0010FFFE|\U0010FFFF")
1515

16-
try:
17-
from collections import deque
18-
except ImportError:
19-
from utils import deque
16+
# Cache for charsUntil()
17+
charsUntilRegEx = {}
2018

2119
class HTMLInputStream(object):
2220
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -68,7 +66,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
6866
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
6967
'replace')
7068

71-
self.queue = deque([])
69+
self.chunk = u""
70+
self.chunkOffset = 0
71+
self.ungetBuffer = [] # reversed list of chars from unget()
7272
self.readChars = []
7373
self.errors = []
7474

@@ -247,21 +247,25 @@ def char(self):
247247
""" Read one character from the stream or queue if available. Return
248248
EOF when EOF is reached.
249249
"""
250-
if not self.queue:
251-
self.readChunk()
252-
#If we still don't have a character we have reached EOF
253-
if not self.queue:
254-
return EOF
255-
256-
char = self.queue.popleft()
257-
250+
if self.ungetBuffer:
251+
return self.ungetBuffer.pop()
252+
253+
if self.chunkOffset >= len(self.chunk):
254+
if not self.readChunk():
255+
return EOF
256+
257+
char = self.chunk[self.chunkOffset]
258+
self.chunkOffset += 1
259+
258260
self.readChars.append(char)
259261
return char
260262

261263
def readChunk(self, chunkSize=10240):
262264
data = self.dataStream.read(chunkSize)
263265
if not data:
264-
return
266+
self.chunk = u""
267+
self.chunkOffset = 0
268+
return False
265269
#Replace null characters
266270
for i in xrange(data.count(u"\u0000")):
267271
self.errors.append("null-character")
@@ -275,53 +279,67 @@ def readChunk(self, chunkSize=10240):
275279
self._lastChunkEndsWithCR = data[-1] == "\r"
276280
data = data.replace("\r\n", "\n")
277281
data = data.replace("\r", "\n")
278-
282+
279283
data = unicode(data)
280-
self.queue.extend(list(data))
284+
self.chunk = data
285+
self.chunkOffset = 0
281286

282287
self.updatePosition()
288+
return True
283289

284290
def charsUntil(self, characters, opposite = False):
285291
""" Returns a string of characters from the stream up to but not
286-
including any character in characters or EOF. characters can be
287-
any container that supports the in method being called on it.
292+
including any character in 'characters' or EOF. 'characters' must be
293+
a container that supports the 'in' method and iteration over its
294+
characters.
288295
"""
289296

290-
#This method is currently 40-50% of our total runtime and badly needs
291-
#optimizing
292-
#Possible improvements:
293-
# - use regexp to find characters that match the required character set
294-
# (with regexp cache since we do the same searches many many times)
295-
# - improve EOF handling for fewer if statements
296-
297-
if not self.queue:
298-
self.readChunk()
299-
#Break if we have reached EOF
300-
if not self.queue or self.queue[0] == None:
301-
return u""
302-
303-
i = 0
304-
while (self.queue[i] in characters) == opposite:
305-
i += 1
306-
if i == len(self.queue):
307-
self.readChunk()
308-
#If the queue doesn't grow we have reached EOF
309-
if i == len(self.queue) or self.queue[i] is EOF:
297+
rv = []
298+
299+
# The unget buffer is typically small and rarely used, so
300+
# just check each character individually
301+
while self.ungetBuffer:
302+
if self.ungetBuffer[-1] == EOF or (self.ungetBuffer[-1] in characters) != opposite:
303+
r = u"".join(rv)
304+
self.readChars.extend(list(r))
305+
return r
306+
else:
307+
rv.append(self.ungetBuffer.pop())
308+
309+
# Use a cache of regexps to find the required characters
310+
try:
311+
chars = charsUntilRegEx[characters]
312+
except KeyError:
313+
for c in characters: assert(ord(c) < 128)
314+
regex = u"".join("\\x%02x" % ord(c) for c in characters)
315+
if not opposite:
316+
regex = u"^%s" % regex
317+
chars = charsUntilRegEx[characters] = re.compile(u"[%s]*" % regex)
318+
319+
while True:
320+
# Find the longest matching prefix
321+
m = chars.match(self.chunk, self.chunkOffset)
322+
# If not everything matched, return everything up to the part that didn't match
323+
if m.end() != len(self.chunk):
324+
rv.append(self.chunk[self.chunkOffset:m.end()])
325+
self.chunkOffset = m.end()
326+
break
327+
# If the whole chunk matched, use it all and read the next chunk
328+
rv.append(self.chunk[self.chunkOffset:])
329+
if not self.readChunk():
330+
# Reached EOF
310331
break
311332

312-
rv = [self.queue.popleft() for c in range(i)]
313-
314-
self.readChars.extend(rv)
315-
316-
rv = u"".join(rv)
317-
return rv
333+
r = u"".join(rv)
334+
self.readChars.extend(list(r))
335+
return r
318336

319337
def unget(self, chars):
320338
self.updatePosition()
321339
if chars:
322340
l = list(chars)
323341
l.reverse()
324-
self.queue.extendleft(l)
342+
self.ungetBuffer.extend(l)
325343
#Alter the current line, col position
326344
for c in chars[::-1]:
327345
if c is None:

0 commit comments

Comments
 (0)