Skip to content

Commit dda96f8

Browse files
committed
Get rid of obsolete replacement of unpaired surrogates with U+FFFD.
1 parent 46dae3d commit dda96f8

File tree

2 files changed

+1
-8
lines changed

2 files changed

+1
-8
lines changed

.pytest.expect

-228 Bytes
Binary file not shown.

html5lib/inputstream.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -183,14 +183,10 @@ def __init__(self, source):
183183
# Such platforms will have already checked for such
184184
# surrogate errors, so no need to do this checking.
185185
self.reportCharacterErrors = None
186-
self.replaceCharactersRegexp = None
187186
elif len("\U0010FFFF") == 1:
188187
self.reportCharacterErrors = self.characterErrorsUCS4
189-
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
190188
else:
191189
self.reportCharacterErrors = self.characterErrorsUCS2
192-
self.replaceCharactersRegexp = re.compile(
193-
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
194190

195191
# List of where new lines occur
196192
self.newLines = [0]
@@ -288,10 +284,7 @@ def readChunk(self, chunkSize=None):
288284
if self.reportCharacterErrors:
289285
self.reportCharacterErrors(data)
290286

291-
# Replace invalid characters
292-
# Note U+0000 is dealt with in the tokenizer
293-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
294-
287+
# Replace invalid characters
295288
data = data.replace("\r\n", "\n")
296289
data = data.replace("\r", "\n")
297290

0 commit comments

Comments (0)