Skip to content

Commit 329cae6

Browse files
committed
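Get html5lib passing its tests again on UCS2 (narrow) Python builds after 964568c175. On UCS2 builds the replacement regexp now explicitly matches only lone surrogates, so valid surrogate pairs are left intact; U+0000 is replaced in a separate step.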
1 parent 6b42a4c commit 329cae6

File tree

1 file changed

+7
-4
lines changed

1 file changed

+7
-4
lines changed

html5lib/inputstream.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
134134
#Craziness
135135
if len(u"\U0010FFFF") == 1:
136136
self.reportCharacterErrors = self.characterErrorsUCS4
137+
self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
137138
else:
138139
self.reportCharacterErrors = self.characterErrorsUCS2
139-
140-
self.replaceCharactersRegexp = re.compile(u"[\u0000\uD800-\uDFFF]")
140+
self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
141141

142142
# List of where new lines occur
143143
self.newLines = [0]
@@ -349,8 +349,11 @@ def readChunk(self, chunkSize=None):
349349
return False
350350

351351
self.reportCharacterErrors(data)
352-
353-
data = self.replaceCharactersRegexp.subn(u"\ufffd", data)[0]
352+
353+
# Replace invalid characters
354+
data = data.replace(u"\u0000", u"\ufffd")
355+
data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
356+
354357
#Check for CR LF broken across chunks
355358
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
356359
data = data[1:]

0 commit comments

Comments
 (0)