@@ -178,8 +178,8 @@ def reset(self):
178
178
# number of columns in the last line of the previous chunk
179
179
self .prevNumCols = 0
180
180
181
- #Flag to indicate we may have a CR LF broken across a data chunk
182
- self ._lastChunkEndsWithCR = False
181
+ #Holds a trailing CR or lead surrogate kept back from the previous chunk, so CR LF and surrogate pairs split over chunk boundaries are processed together
182
+ self ._bufferedCharacter = None
183
183
184
184
def openStream (self , source ):
185
185
"""Produces a file object from source.
@@ -344,23 +344,30 @@ def readChunk(self, chunkSize=None):
344
344
self .chunkOffset = 0
345
345
346
346
data = self .dataStream .read (chunkSize )
347
-
348
- if not data :
347
+
348
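+ #Deal with CR LF and surrogate pairs broken across chunks: rejoin the character held back from the end of the previous chunk with this chunk (NOTE(review): that character came first, so it should precede the new data - confirm the concatenation order below)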
349
+ if self ._bufferedCharacter :
350
+ if data :
351
+ data = data + self ._bufferedCharacter
352
+ else :
353
+ data = self ._bufferedCharacter
354
+ self ._bufferedCharacter = None
355
+ elif not data :
356
+ # We have no more data, bye-bye stream
349
357
return False
350
358
359
+ if len (data ) > 1 :
360
+ lastv = ord (data [- 1 ])
361
+ if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF :
362
+ self ._bufferedCharacter = data [- 1 ]
363
+ data = data [:- 1 ]
364
+
351
365
self .reportCharacterErrors (data )
352
366
353
367
# Replace invalid characters
354
368
data = data .replace (u"\u0000 " , u"\ufffd " )
355
369
data = self .replaceCharactersRegexp .sub (u"\ufffd " , data )
356
-
357
- #Check for CR LF broken across chunks
358
- if (self ._lastChunkEndsWithCR and data [0 ] == u"\n " ):
359
- data = data [1 :]
360
- # Stop if the chunk is now empty
361
- if not data :
362
- return False
363
- self ._lastChunkEndsWithCR = data [- 1 ] == u"\r "
370
+
364
371
data = data .replace (u"\r \n " , u"\n " )
365
372
data = data .replace (u"\r " , u"\n " )
366
373
@@ -400,8 +407,6 @@ def characterErrorsUCS2(self, data):
400
407
else :
401
408
skip = False
402
409
self .errors .append ("invalid-codepoint" )
403
- #This is still wrong if it is possible for a surrogate pair to break a
404
- #chunk boundary
405
410
406
411
def charsUntil (self , characters , opposite = False ):
407
412
""" Returns a string of characters from the stream up to but not
0 commit comments