Skip to content

Commit 8aab9d8

Browse files
committed
Do not directly use isolated surrogates in unicode literals for platforms besides Jython
1 parent 41c90ae commit 8aab9d8

File tree

1 file changed

+35
-9
lines changed

1 file changed

+35
-9
lines changed

html5lib/inputstream.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from six.moves import http_client
44

55
import codecs
6+
import platform
67
import re
78

89
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -28,7 +29,19 @@ class BufferedIOBase(object):
2829
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
2930
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
3031

31-
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
32+
33+
invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
34+
35+
if platform.python_implementation() == "Jython":
36+
# Jython does not allow solitary surrogate escapes (\uD800-\uDFFF)
37+
# in string literals or anywhere else. This is because it uses
38+
# UTF-16, which itself relies on surrogate pairs to encode non-BMP characters.
39+
invalid_unicode_re = re.compile(invalid_unicode_template % "")
40+
else:
41+
# Instead, use one extra step of indirection and create the surrogates
42+
# at runtime with unichr
43+
invalid_unicode_re = re.compile(invalid_unicode_template % (
44+
"%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
3245

3346
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
3447
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +177,23 @@ def __init__(self, source):
164177
165178
"""
166179

167-
# Craziness
168-
if len("\U0010FFFF") == 1:
180+
if platform.python_implementation() == "Jython":
181+
# By its nature Jython's UTF-16 support does not allow
182+
# surrogate errors, so no need to do this checking.
183+
self.reportCharacterErrors = None
184+
self.replaceCharactersRegexp = None
185+
elif len("\U0010FFFF") == 1:
169186
self.reportCharacterErrors = self.characterErrorsUCS4
170-
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
187+
self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
188+
unichr(0xD800), unichr(0xDFFF)))
171189
else:
172190
self.reportCharacterErrors = self.characterErrorsUCS2
173-
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
191+
self.replaceCharactersRegexp = re.compile(
192+
"([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
193+
unichr(0xD800), unichr(0xDBFF),
194+
unichr(0xDC00), unichr(0xDFFF),
195+
unichr(0xD800), unichr(0xDBFF),
196+
unichr(0xDC00), unichr(0xDFFF)))
174197

175198
# List of where new lines occur
176199
self.newLines = [0]
@@ -265,11 +288,14 @@ def readChunk(self, chunkSize=None):
265288
self._bufferedCharacter = data[-1]
266289
data = data[:-1]
267290

268-
self.reportCharacterErrors(data)
291+
if platform.python_implementation() != "Jython":
292+
# data is already Unicode, so Jython has already dealt
293+
# with any surrogate character errors; no need to check here
294+
self.reportCharacterErrors(data)
269295

270-
# Replace invalid characters
271-
# Note U+0000 is dealt with in the tokenizer
272-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
296+
# Replace invalid characters
297+
# Note U+0000 is dealt with in the tokenizer
298+
data = self.replaceCharactersRegexp.sub("\ufffd", data)
273299

274300
data = data.replace("\r\n", "\n")
275301
data = data.replace("\r", "\n")

0 commit comments

Comments
 (0)