@@ -183,14 +183,10 @@ def __init__(self, source):
183
183
# Such platforms will have already checked for such
184
184
# surrogate errors, so no need to do this checking.
185
185
self .reportCharacterErrors = None
186
- self .replaceCharactersRegexp = None
187
186
elif len ("\U0010FFFF " ) == 1 :
188
187
self .reportCharacterErrors = self .characterErrorsUCS4
189
- self .replaceCharactersRegexp = re .compile (eval ('"[\\ uD800-\\ uDFFF]"' ))
190
188
else :
191
189
self .reportCharacterErrors = self .characterErrorsUCS2
192
- self .replaceCharactersRegexp = re .compile (
193
- eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
194
190
195
191
# List of where new lines occur
196
192
self .newLines = [0 ]
@@ -288,10 +284,7 @@ def readChunk(self, chunkSize=None):
288
284
if self .reportCharacterErrors :
289
285
self .reportCharacterErrors (data )
290
286
291
- # Replace invalid characters
292
- # Note U+0000 is dealt with in the tokenizer
293
- data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
294
-
287
+ # Normalize newlines: CRLF pairs and lone CRs both become LF
295
288
data = data .replace ("\r \n " , "\n " )
296
289
data = data .replace ("\r " , "\n " )
297
290
0 commit comments