13
13
14
14
invalid_unicode_re = re .compile (u"[\u0001 -\u0008 ]|[\u000E -\u001F ]|[\u007F -\u009F ]|[\uD800 -\uDFFF ]|[\uFDD0 -\uFDDF ]|\uFFFE |\uFFFF |\U0001FFFE |\U0001FFFF |\U0002FFFE |\U0002FFFF |\U0003FFFE |\U0003FFFF |\U0004FFFE |\U0004FFFF |\U0005FFFE |\U0005FFFF |\U0006FFFE |\U0006FFFF |\U0007FFFE |\U0007FFFF |\U0008FFFE |\U0008FFFF |\U0009FFFE |\U0009FFFF |\U000AFFFE |\U000AFFFF |\U000BFFFE \U000BFFFF |\U000CFFFE |\U000CFFFF |\U000DFFFE |\U000DFFFF |\U000EFFFE |\U000EFFFF |\U000FFFFE |\U000FFFFF |\U0010FFFE |\U0010FFFF " )
15
15
16
- try :
17
- from collections import deque
18
- except ImportError :
19
- from utils import deque
16
+ # Cache for charsUntil()
17
+ charsUntilRegEx = {}
20
18
21
19
class HTMLInputStream (object ):
22
20
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -68,7 +66,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
68
66
self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
69
67
'replace' )
70
68
71
- self .queue = deque ([])
69
+ self .chunk = u""
70
+ self .chunkOffset = 0
71
+ self .ungetBuffer = [] # reversed list of chars from unget()
72
72
self .readChars = []
73
73
self .errors = []
74
74
@@ -247,21 +247,25 @@ def char(self):
247
247
""" Read one character from the stream or queue if available. Return
248
248
EOF when EOF is reached.
249
249
"""
250
- if not self .queue :
251
- self .readChunk ()
252
- #If we still don't have a character we have reached EOF
253
- if not self .queue :
254
- return EOF
255
-
256
- char = self .queue .popleft ()
257
-
250
+ if self .ungetBuffer :
251
+ return self .ungetBuffer .pop ()
252
+
253
+ if self .chunkOffset >= len (self .chunk ):
254
+ if not self .readChunk ():
255
+ return EOF
256
+
257
+ char = self .chunk [self .chunkOffset ]
258
+ self .chunkOffset += 1
259
+
258
260
self .readChars .append (char )
259
261
return char
260
262
261
263
def readChunk (self , chunkSize = 10240 ):
262
264
data = self .dataStream .read (chunkSize )
263
265
if not data :
264
- return
266
+ self .chunk = u""
267
+ self .chunkOffset = 0
268
+ return False
265
269
#Replace null characters
266
270
for i in xrange (data .count (u"\u0000 " )):
267
271
self .errors .append ("null-character" )
@@ -275,53 +279,67 @@ def readChunk(self, chunkSize=10240):
275
279
self ._lastChunkEndsWithCR = data [- 1 ] == "\r "
276
280
data = data .replace ("\r \n " , "\n " )
277
281
data = data .replace ("\r " , "\n " )
278
-
282
+
279
283
data = unicode (data )
280
- self .queue .extend (list (data ))
284
+ self .chunk = data
285
+ self .chunkOffset = 0
281
286
282
287
self .updatePosition ()
288
+ return True
283
289
284
290
def charsUntil (self , characters , opposite = False ):
285
291
""" Returns a string of characters from the stream up to but not
286
- including any character in characters or EOF. characters can be
287
- any container that supports the in method being called on it.
292
+ including any character in 'characters' or EOF. 'characters' must be
293
+ a container that supports the 'in' method and iteration over its
294
+ characters.
288
295
"""
289
296
290
- #This method is currently 40-50% of our total runtime and badly needs
291
- #optimizing
292
- #Possible improvements:
293
- # - use regexp to find characters that match the required character set
294
- # (with regexp cache since we do the same searches many many times)
295
- # - improve EOF handling for fewer if statements
296
-
297
- if not self .queue :
298
- self .readChunk ()
299
- #Break if we have reached EOF
300
- if not self .queue or self .queue [0 ] == None :
301
- return u""
302
-
303
- i = 0
304
- while (self .queue [i ] in characters ) == opposite :
305
- i += 1
306
- if i == len (self .queue ):
307
- self .readChunk ()
308
- #If the queue doesn't grow we have reached EOF
309
- if i == len (self .queue ) or self .queue [i ] is EOF :
297
+ rv = []
298
+
299
+ # The unget buffer is typically small and rarely used, so
300
+ # just check each character individually
301
+ while self .ungetBuffer :
302
+ if self .ungetBuffer [- 1 ] == EOF or (self .ungetBuffer [- 1 ] in characters ) != opposite :
303
+ r = u"" .join (rv )
304
+ self .readChars .extend (list (r ))
305
+ return r
306
+ else :
307
+ rv .append (self .ungetBuffer .pop ())
308
+
309
+ # Use a cache of regexps to find the required characters
310
+ try :
311
+ chars = charsUntilRegEx [characters ]
312
+ except KeyError :
313
+ for c in characters : assert (ord (c ) < 128 )
314
+ regex = u"" .join ("\\ x%02x" % ord (c ) for c in characters )
315
+ if not opposite :
316
+ regex = u"^%s" % regex
317
+ chars = charsUntilRegEx [characters ] = re .compile (u"[%s]*" % regex )
318
+
319
+ while True :
320
+ # Find the longest matching prefix
321
+ m = chars .match (self .chunk , self .chunkOffset )
322
+ # If not everything matched, return everything up to the part that didn't match
323
+ if m .end () != len (self .chunk ):
324
+ rv .append (self .chunk [self .chunkOffset :m .end ()])
325
+ self .chunkOffset = m .end ()
326
+ break
327
+ # If the whole chunk matched, use it all and read the next chunk
328
+ rv .append (self .chunk [self .chunkOffset :])
329
+ if not self .readChunk ():
330
+ # Reached EOF
310
331
break
311
332
312
- rv = [self .queue .popleft () for c in range (i )]
313
-
314
- self .readChars .extend (rv )
315
-
316
- rv = u"" .join (rv )
317
- return rv
333
+ r = u"" .join (rv )
334
+ self .readChars .extend (list (r ))
335
+ return r
318
336
319
337
def unget (self , chars ):
320
338
self .updatePosition ()
321
339
if chars :
322
340
l = list (chars )
323
341
l .reverse ()
324
- self .queue . extendleft (l )
342
+ self .ungetBuffer . extend (l )
325
343
#Alter the current line, col position
326
344
for c in chars [::- 1 ]:
327
345
if c is None :
0 commit comments