Skip to content

Commit 55bfda0

Browse files
author
James Graham
committed
Replace unpaired surrogates with replacement characters (unlikely to work on UCS2 python)
1 parent 2f50d60 commit 55bfda0

File tree

1 file changed

+4
-1
lines changed

1 file changed

+4
-1
lines changed

html5lib/inputstream.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
137137
else:
138138
self.reportCharacterErrors = self.characterErrorsUCS2
139139

140+
self.replaceCharactersRegexp = re.compile(u"[\u0000\uD800-\uDFFF]")
141+
140142
# List of where new lines occur
141143
self.newLines = [0]
142144

@@ -159,6 +161,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
159161
if (self.charEncoding[0] is None):
160162
self.charEncoding = self.detectEncoding(parseMeta, chardet)
161163

164+
162165
self.reset()
163166

164167
def reset(self):
@@ -347,7 +350,7 @@ def readChunk(self, chunkSize=None):
347350

348351
self.reportCharacterErrors(data)
349352

350-
data = data.replace(u"\u0000", u"\ufffd")
353+
data = self.replaceCharactersRegexp.subn(u"\ufffd", data)[0]
351354
#Check for CR LF broken across chunks
352355
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
353356
data = data[1:]

0 commit comments

Comments
 (0)