3
3
from six .moves import http_client
4
4
5
5
import codecs
6
+ import platform
6
7
import re
7
8
8
9
from .constants import EOF , spaceCharacters , asciiLetters , asciiUppercase
@@ -28,7 +29,19 @@ class BufferedIOBase(object):
28
29
asciiUppercaseBytes = frozenset ([item .encode ("ascii" ) for item in asciiUppercase ])
29
30
spacesAngleBrackets = spaceCharactersBytes | frozenset ([b">" , b"<" ])
30
31
31
- invalid_unicode_re = re .compile ("[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uD800 -\uDFFF \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF ]" )
32
+
33
+ invalid_unicode_template = "[\u0001 -\u0008 \u000B \u000E -\u001F \u007F -\u009F \uFDD0 -\uFDEF \uFFFE \uFFFF \U0001FFFE \U0001FFFF \U0002FFFE \U0002FFFF \U0003FFFE \U0003FFFF \U0004FFFE \U0004FFFF \U0005FFFE \U0005FFFF \U0006FFFE \U0006FFFF \U0007FFFE \U0007FFFF \U0008FFFE \U0008FFFF \U0009FFFE \U0009FFFF \U000AFFFE \U000AFFFF \U000BFFFE \U000BFFFF \U000CFFFE \U000CFFFF \U000DFFFE \U000DFFFF \U000EFFFE \U000EFFFF \U000FFFFE \U000FFFFF \U0010FFFE \U0010FFFF %s]"
34
+
35
+ if platform .python_implementation () == "Jython" :
36
+ # Jython does not allow lone surrogate escapes
37
+ # (\uD800-\uDFFF) in string literals or anywhere else, because it
38
+ # uses UTF-16, which reserves those code units for surrogate pairs.
39
+ invalid_unicode_re = re .compile (invalid_unicode_template % "" )
40
+ else :
41
+ # Instead use one extra step of indirection and create surrogates with
42
+ # unichr
43
+ invalid_unicode_re = re .compile (invalid_unicode_template % (
44
+ "%s-%s" % (unichr (0xD800 ), unichr (0xDFFF )),))
32
45
33
46
non_bmp_invalid_codepoints = set ([0x1FFFE , 0x1FFFF , 0x2FFFE , 0x2FFFF , 0x3FFFE ,
34
47
0x3FFFF , 0x4FFFE , 0x4FFFF , 0x5FFFE , 0x5FFFF ,
@@ -164,13 +177,23 @@ def __init__(self, source):
164
177
165
178
"""
166
179
167
- # Craziness
168
- if len ("\U0010FFFF " ) == 1 :
180
+ if platform .python_implementation () == "Jython" :
181
+ # By its nature Jython's UTF-16 support does not allow
182
+ # surrogate errors, so no need to do this checking.
183
+ self .reportCharacterErrors = None
184
+ self .replaceCharactersRegexp = None
185
+ elif len ("\U0010FFFF " ) == 1 :
169
186
self .reportCharacterErrors = self .characterErrorsUCS4
170
- self .replaceCharactersRegexp = re .compile ("[\uD800 -\uDFFF ]" )
187
+ self .replaceCharactersRegexp = re .compile ("[%s-%s]" % (
188
+ unichr (0xD800 ), unichr (0xDFFF )))
171
189
else :
172
190
self .reportCharacterErrors = self .characterErrorsUCS2
173
- self .replaceCharactersRegexp = re .compile ("([\uD800 -\uDBFF ](?![\uDC00 -\uDFFF ])|(?<![\uD800 -\uDBFF ])[\uDC00 -\uDFFF ])" )
191
+ self .replaceCharactersRegexp = re .compile (
192
+ "([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
193
+ unichr (0xD800 ), unichr (0xDBFF ),
194
+ unichr (0xDC00 ), unichr (0xDFFF ),
195
+ unichr (0xD800 ), unichr (0xDBFF ),
196
+ unichr (0xDC00 ), unichr (0xDFFF )))
174
197
175
198
# List of where new lines occur
176
199
self .newLines = [0 ]
@@ -265,11 +288,14 @@ def readChunk(self, chunkSize=None):
265
288
self ._bufferedCharacter = data [- 1 ]
266
289
data = data [:- 1 ]
267
290
268
- self .reportCharacterErrors (data )
291
+ if platform .python_implementation () != "Jython" :
292
+ # data is already Unicode, so Jython has already dealt
293
+ # with any surrogate character errors; no need to check here
294
+ self .reportCharacterErrors (data )
269
295
270
- # Replace invalid characters
271
- # Note U+0000 is dealt with in the tokenizer
272
- data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
296
+ # Replace invalid characters
297
+ # Note U+0000 is dealt with in the tokenizer
298
+ data = self .replaceCharactersRegexp .sub ("\ufffd " , data )
273
299
274
300
data = data .replace ("\r \n " , "\n " )
275
301
data = data .replace ("\r " , "\n " )
0 commit comments