@@ -73,8 +73,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
73
73
self .chunkSize = 0
74
74
self .chunkOffset = 0
75
75
self .errors = []
76
- # Single-character buffer to handle 'unget'
77
- self .ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78
76
79
77
# Remember the current position in the document
80
78
self .positionLine = 1
@@ -257,18 +255,13 @@ def char(self):
257
255
""" Read one character from the stream or queue if available. Return
258
256
EOF when EOF is reached.
259
257
"""
260
- char = self .ungetChar
261
- if char != u"" :
262
- # Use the ungot character, and reset the buffer
263
- self .ungetChar = u""
264
- else :
265
- # Read a new chunk from the input stream if necessary
266
- if self .chunkOffset >= self .chunkSize :
267
- if not self .readChunk ():
268
- return EOF
258
+ # Read a new chunk from the input stream if necessary
259
+ if self .chunkOffset >= self .chunkSize :
260
+ if not self .readChunk ():
261
+ return EOF
269
262
270
- char = self .chunk [self .chunkOffset ]
271
- self .chunkOffset += 1
263
+ char = self .chunk [self .chunkOffset ]
264
+ self .chunkOffset += 1
272
265
273
266
# Update the position attributes
274
267
if char == u"\n " :
@@ -317,18 +310,6 @@ def charsUntil(self, characters, opposite = False):
317
310
characters.
318
311
"""
319
312
320
- rv = []
321
-
322
- # Check the ungot character, if any.
323
- # (Since it's only a single character, don't use the regex here)
324
- char = self .ungetChar
325
- if char != u"" :
326
- if char is EOF or (char in characters ) != opposite :
327
- return u""
328
- else :
329
- rv .append (char )
330
- self .ungetChar = u""
331
-
332
313
# Use a cache of regexps to find the required characters
333
314
try :
334
315
chars = charsUntilRegEx [(characters , opposite )]
@@ -339,6 +320,8 @@ def charsUntil(self, characters, opposite = False):
339
320
regex = u"^%s" % regex
340
321
chars = charsUntilRegEx [(characters , opposite )] = re .compile (u"[%s]+" % regex )
341
322
323
+ rv = []
324
+
342
325
while True :
343
326
# Find the longest matching prefix
344
327
m = chars .match (self .chunk , self .chunkOffset )
@@ -369,21 +352,29 @@ def charsUntil(self, characters, opposite = False):
369
352
def unget(self, char):
    """Push ``char`` back so that it is the next character to be read.

    No separate "ungot character" buffer is used.  The read offset into
    the current chunk is stepped back by one or, when the offset is
    already at the start of the chunk, ``char`` is prepended to the chunk.
    The line/column position attributes are rewound to match.

    ``char`` is expected to be the character that was most recently read;
    when the offset is stepped back this is checked with an assertion.
    Passing ``None`` (the EOF marker) is a no-op.
    """
    # Only one character is meant to be ungotten at once - it must be
    # consumed again before any further call to unget.  (Ungetting a
    # newline clears lastLineLength, so a second consecutive newline
    # unget would trip the assertion further down.)
    if char is None:
        # EOF was never taken out of the chunk, so there is nothing to undo.
        return

    if self.chunkOffset == 0:
        # unget is called quite rarely, so it's a good idea to do more
        # work here if it saves a bit of work in the frequently called
        # char and charsUntil.  So, just prepend the ungotten character
        # onto the current chunk.
        self.chunk = char + self.chunk
        self.chunkSize += 1
    else:
        self.chunkOffset -= 1
        # Sanity check: the character being returned must be the one that
        # now sits at the read offset.
        assert self.chunk[self.chunkOffset] == char

    # Update the position attributes
    if char == u"\n":
        assert self.positionLine >= 1
        assert self.lastLineLength is not None
        # Step back onto the previous line, at the column recorded for it.
        self.positionLine -= 1
        self.positionCol = self.lastLineLength
        self.lastLineLength = None
    else:
        self.positionCol -= 1
387
378
388
379
class EncodingBytes (str ):
389
380
"""String-like object with an assosiated position and various extra methods
0 commit comments