Skip to content

Commit 30ea6e4

Browse files
committed
Get rid of obsolete replacement of unpaired surrogates with U+FFFD.
1 parent 7cce65b commit 30ea6e4

File tree

1 file changed

+0
-5
lines changed

1 file changed

+0
-5
lines changed

html5lib/inputstream.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,8 @@ def __init__(self, source):
167 167   # Craziness
168 168   if len("\U0010FFFF") == 1:
169 169   self.reportCharacterErrors = self.characterErrorsUCS4
170     - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
171 170   else:
172 171   self.reportCharacterErrors = self.characterErrorsUCS2
173     - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
174 172
175 173   # List of where new lines occur
176 174   self.newLines = [0]
@@ -268,9 +266,6 @@ def readChunk(self, chunkSize=None):
268 266   self.reportCharacterErrors(data)
269 267
270 268   # Replace invalid characters
271     - # Note U+0000 is dealt with in the tokenizer
272     - data = self.replaceCharactersRegexp.sub("\ufffd", data)
273     -
274 269   data = data.replace("\r\n", "\n")
275 270   data = data.replace("\r", "\n")
276 271
0 commit comments

Comments (0)