@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
72
72
self .chunk = u""
73
73
self .chunkSize = 0
74
74
self .chunkOffset = 0
75
- self .ungetBuffer = [] # reversed list of chars from unget()
76
- self .readChars = []
77
75
self .errors = []
78
-
79
- self .lineLengths = []
76
+ # Single-character buffer to handle 'unget'
77
+ self .ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78
+
79
+ # Remember the current position in the document
80
+ self .positionLine = 1
81
+ self .positionCol = 0
82
+ # Remember the length of the last line, so unget("\n") can restore
83
+ # positionCol. (Only one character can be ungot at once, so we only
84
+ # need to remember the single last line.)
85
+ self .lastLineLength = None
80
86
81
87
#Flag to indicate we may have a CR LF broken across a data chunk
82
88
self ._lastChunkEndsWithCR = False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219
225
encoding = parser .getEncoding ()
220
226
return encoding
221
227
222
- def updatePosition (self ):
223
- #Remove EOF from readChars, if present
224
- if not self .readChars :
225
- return
226
- if self .readChars and self .readChars [- 1 ] == EOF :
227
- #There may be more than one EOF in readChars so we cannot assume
228
- #readChars.index(EOF) == -1
229
- self .readChars = self .readChars [:self .readChars .index (EOF )]
230
- readChars = "" .join (self .readChars )
231
- lines = readChars .split ("\n " )
232
- if self .lineLengths :
233
- self .lineLengths [- 1 ] += len (lines [0 ])
228
def updatePosition(self, chars):
    """Advance the stored position past a run of consumed characters.

    chars is a (possibly empty) string of characters that have just been
    read.  The method updates three attributes in place:

    * positionLine    -- incremented once per newline in chars
    * positionCol     -- number of characters after the last newline
    * lastLineLength  -- length of the line terminated by the last
                         newline in chars (left untouched when chars
                         contains no newline)

    Returns None.
    """
    newlines = chars.count(u"\n")
    if not newlines:
        # Still on the same line: only the column moves
        self.positionCol += len(chars)
        return

    # Index of the last newline; everything after it is on the current line
    last = chars.rfind(u"\n")
    if newlines == 1:
        # The finished line started before chars did, so it includes
        # the columns that had already been consumed
        self.lastLineLength = self.positionCol + last
    else:
        # The finished line lies entirely inside chars, between the
        # last-but-one newline and the last one
        previous = chars.rfind(u"\n", 0, last)
        self.lastLineLength = last - (previous + 1)

    self.positionLine += newlines
    self.positionCol = len(chars) - (last + 1)
240
251
241
252
def position(self):
    """Returns (line, col) of the current position in the stream.

    The values are read directly from the positionLine and positionCol
    attributes, which the reading methods keep up to date; nothing is
    recomputed here.
    """
    return (self.positionLine, self.positionCol)
249
255
250
256
def char(self):
    """ Read one character from the stream or queue if available. Return
        EOF when EOF is reached.

    A character previously handed back through the single-character
    unget buffer (ungetChar) is returned first; u"" in that buffer means
    'nothing buffered'.  Otherwise the next character of the current
    chunk is returned, reading a new chunk first when the current one
    is exhausted.  positionLine, positionCol and lastLineLength are
    updated for every character returned, except EOF.
    """
    char = self.ungetChar
    if char != u"":
        # Use the ungot character, and reset the buffer
        self.ungetChar = u""
    else:
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                # No more data: the position is left untouched
                return EOF

        char = self.chunk[self.chunkOffset]
        self.chunkOffset += 1

    # Update the position attributes
    if char == u"\n":
        # Remember the finished line's length so that a later
        # unget of this newline can restore the column
        self.lastLineLength = self.positionCol
        self.positionCol = 0
        self.positionLine += 1
    elif char is not EOF:
        # An ungot EOF comes back through here and must not move the column
        self.positionCol += 1

    return char
268
282
269
283
def readChunk (self , chunkSize = _defaultChunkSize ):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282
296
283
297
data = data .replace (u"\u0000 " , u"\ufffd " )
284
298
#Check for CR LF broken across chunks
285
- if (self ._lastChunkEndsWithCR and data [0 ] == "\n " ):
299
+ if (self ._lastChunkEndsWithCR and data [0 ] == u "\n " ):
286
300
data = data [1 :]
287
301
# Stop if the chunk is now empty
288
302
if not data :
289
303
return False
290
- self ._lastChunkEndsWithCR = data [- 1 ] == "\r "
291
- data = data .replace ("\r \n " , "\n " )
292
- data = data .replace ("\r " , "\n " )
304
+ self ._lastChunkEndsWithCR = data [- 1 ] == u "\r "
305
+ data = data .replace (u "\r \n " , u "\n " )
306
+ data = data .replace (u "\r " , u "\n " )
293
307
294
- data = unicode (data )
295
308
self .chunk = data
296
309
self .chunkSize = len (data )
297
310
298
- self .updatePosition ()
299
311
return True
300
312
301
313
def charsUntil (self , characters , opposite = False ):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307
319
308
320
rv = []
309
321
310
- # The unget buffer is typically small and rarely used, so
311
- # just check each character individually
312
- while self .ungetBuffer :
313
- if self .ungetBuffer [- 1 ] == EOF or (self .ungetBuffer [- 1 ] in characters ) != opposite :
314
- r = u"" .join (rv )
315
- self .readChars .extend (list (r ))
316
- return r
322
+ # Check the ungot character, if any.
323
+ # (Since it's only a single character, don't use the regex here)
324
+ char = self .ungetChar
325
+ if char != u"" :
326
+ if char is EOF or (char in characters ) != opposite :
327
+ return u""
317
328
else :
318
- rv .append (self .ungetBuffer .pop ())
329
+ rv .append (char )
330
+ self .ungetChar = u""
319
331
320
332
# Use a cache of regexps to find the required characters
321
333
try :
322
334
chars = charsUntilRegEx [(characters , opposite )]
323
335
except KeyError :
324
336
for c in characters : assert (ord (c ) < 128 )
325
- regex = u"" .join (["\\ x%02x" % ord (c ) for c in characters ])
337
+ regex = u"" .join ([u "\\ x%02x" % ord (c ) for c in characters ])
326
338
if not opposite :
327
339
regex = u"^%s" % regex
328
340
chars = charsUntilRegEx [(characters , opposite )] = re .compile (u"[%s]*" % regex )
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343
355
break
344
356
345
357
r = u"" .join (rv )
346
- self .readChars . extend ( list ( r ) )
358
+ self .updatePosition ( r )
347
359
return r
348
360
349
- def unget (self , chars ):
350
- self .updatePosition ()
351
- if chars :
352
- l = list (chars )
353
- l .reverse ()
354
- self .ungetBuffer .extend (l )
355
- #Alter the current line, col position
356
- for c in chars [::- 1 ]:
357
- if c is None :
358
- continue
359
- elif c == '\n ' :
360
- assert self .lineLengths [- 1 ] == 0
361
- self .lineLengths .pop ()
362
- else :
363
- self .lineLengths [- 1 ] -= 1
361
def unget(self, char):
    """Push a single character back so the next read returns it again.

    char is the character most recently read, or None for EOF.  Only
    one character may be ungotten at once - it must be consumed again
    before any further call to unget.  The position attributes are
    rewound to where they were before char was read.

    Passing u"" is a no-op: u"" is the buffer's 'no character' marker,
    so there is nothing to push back and the position must not move.
    """
    if char == u"":
        return

    assert self.ungetChar == u""

    self.ungetChar = char

    # Update the position attributes
    if char is None:
        # EOF never advanced the position, so there is nothing to undo
        pass
    elif char == u"\n":
        # Lines are numbered from 1, so a newline can only be ungotten
        # once the position has moved past the first line
        assert self.positionLine > 1
        assert self.lastLineLength is not None
        self.positionLine -= 1
        self.positionCol = self.lastLineLength
        # The remembered length is used up; reading the newline again
        # records it afresh
        self.lastLineLength = None
    else:
        self.positionCol -= 1
364
379
365
380
class EncodingBytes (str ):
366
381
"""String-like object with an associated position and various extra methods
0 commit comments