Skip to content

Commit 15d9b0e

Browse files
committed
Deal with surrogates broken over chunk boundaries.
1 parent 329cae6 commit 15d9b0e

File tree

1 file changed

+19
-14
lines changed

1 file changed

+19
-14
lines changed

html5lib/inputstream.py

Lines changed: 19 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -178,8 +178,8 @@ def reset(self):
178178
# number of columns in the last line of the previous chunk
179179
self.prevNumCols = 0
180180

181-
#Flag to indicate we may have a CR LF broken across a data chunk
182-
self._lastChunkEndsWithCR = False
181+
#Deal with CR LF and surrogates split over chunk boundaries
182+
self._bufferedCharacter = None
183183

184184
def openStream(self, source):
185185
"""Produces a file object from source.
@@ -344,23 +344,30 @@ def readChunk(self, chunkSize=None):
344344
self.chunkOffset = 0
345345

346346
data = self.dataStream.read(chunkSize)
347-
348-
if not data:
347+
348+
#Deal with CR LF and surrogates broken across chunks
349+
if self._bufferedCharacter:
350+
if data:
351+
data = data + self._bufferedCharacter
352+
else:
353+
data = self._bufferedCharacter
354+
self._bufferedCharacter = None
355+
elif not data:
356+
# We have no more data, bye-bye stream
349357
return False
350358

359+
if len(data) > 1:
360+
lastv = ord(data[-1])
361+
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
362+
self._bufferedCharacter = data[-1]
363+
data = data[:-1]
364+
351365
self.reportCharacterErrors(data)
352366

353367
# Replace invalid characters
354368
data = data.replace(u"\u0000", u"\ufffd")
355369
data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
356-
357-
#Check for CR LF broken across chunks
358-
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
359-
data = data[1:]
360-
# Stop if the chunk is now empty
361-
if not data:
362-
return False
363-
self._lastChunkEndsWithCR = data[-1] == u"\r"
370+
364371
data = data.replace(u"\r\n", u"\n")
365372
data = data.replace(u"\r", u"\n")
366373

@@ -400,8 +407,6 @@ def characterErrorsUCS2(self, data):
400407
else:
401408
skip = False
402409
self.errors.append("invalid-codepoint")
403-
#This is still wrong if it is possible for a surrogate pair to break a
404-
#chunk boundary
405410

406411
def charsUntil(self, characters, opposite = False):
407412
""" Returns a string of characters from the stream up to but not

0 commit comments

Comments
 (0)