@@ -28,7 +28,18 @@ class BufferedIOBase(object):
28
28
# ASCII-encoded byte-string form of every character in asciiUppercase.
asciiUppercaseBytes = frozenset(
    letter.encode("ascii") for letter in asciiUppercase)
29
29
# The space-character bytes plus the two angle brackets.
spacesAngleBrackets = spaceCharactersBytes | frozenset((b">", b"<"))
30
30
31
- invalid_unicode_re = re .compile ("[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
31
+
32
# Regex character class (as a string, including the surrounding brackets)
# for the code points treated as invalid in the input stream: U+0001-U+0008,
# U+000B, U+000E-U+001F, U+007F-U+009F, U+FDD0-U+FDEF and the xFFFE/xFFFF
# pair of every plane. Lone surrogates are deliberately left out so that the
# literal is legal on every platform.
invalid_unicode_no_surrogate = (
    "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF"
    "\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF"
    "\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF"
    "\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF"
    "\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF"
    "\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF"
    "\U0010FFFE\U0010FFFF]")

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogate range
    # with eval. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone surrogates.
    # The eval argument is a fixed literal, never external data.
    #
    # The range must be spliced in *before* the closing "]" so that it is
    # part of the character class; appending it after the bracket would
    # make the pattern require a literal surrogate sequence to follow.
    invalid_unicode_re = re.compile(
        invalid_unicode_no_surrogate[:-1] +
        eval('"\\uD800-\\uDFFF"') +
        "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
32
43
33
44
non_bmp_invalid_codepoints = set ([0x1FFFE , 0x1FFFF , 0x2FFFE , 0x2FFFF , 0x3FFFE ,
34
45
0x3FFFF , 0x4FFFE , 0x4FFFF , 0x5FFFE , 0x5FFFF ,
@@ -164,13 +175,18 @@ def __init__(self, source):
164
175
165
176
"""
166
177
167
- # Craziness
168
- if len ("\U0010FFFF " ) == 1 :
178
+ if not utils .supports_lone_surrogates :
179
+ # Such platforms will have already checked for such
180
+ # surrogate errors, so no need to do this checking.
181
+ self .reportCharacterErrors = None
182
+ self .replaceCharactersRegexp = None
183
+ elif len ("\U0010FFFF " ) == 1 :
169
184
self .reportCharacterErrors = self .characterErrorsUCS4
170
- self .replaceCharactersRegexp = re .compile ("[\uD800 -\uDFFF ]" )
185
+ self .replaceCharactersRegexp = re .compile (eval ( ' "[\\ uD800-\\ uDFFF]"' ) )
171
186
else :
172
187
self .reportCharacterErrors = self .characterErrorsUCS2
173
- self .replaceCharactersRegexp = re .compile ("([\uD800 -\uDBFF ](?![\uDC00 -\uDFFF ])|(?<![\uD800 -\uDBFF ])[\uDC00 -\uDFFF ])" )
188
+ self .replaceCharactersRegexp = re .compile (
189
+ eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
174
190
175
191
# List of where new lines occur
176
192
self .newLines = [0 ]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
265
281
self ._bufferedCharacter = data [- 1 ]
266
282
data = data [:- 1 ]
267
283
268
- self .reportCharacterErrors (data )
284
+ if self .reportCharacterErrors :
285
+ self .reportCharacterErrors (data )
269
286
270
- # Replace invalid characters
271
- # Note U+0000 is dealt with in the tokenizer
272
- data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
287
+ # Replace invalid characters
288
+ # Note U+0000 is dealt with in the tokenizer
289
+ data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
273
290
274
291
data = data .replace ("\r \n " , "\n " )
275
292
data = data .replace ("\r " , "\n " )
0 commit comments