Skip to content

Commit b51828b

Browse files
committed
Allow for Python implementations that don't support lone surrogates (read: Jython).
This is based on earlier work by Jim Baker (thanks!). The two major parts of this are:
* Avoiding having lone surrogates in any string literals, and
* Avoiding tests that contain lone surrogates.
As part of this, the decoder for double-escaped tokenizer tests is rewritten to avoid unicode_escape, as that has bogus behaviour with non-ASCII characters.
1 parent b293489 commit b51828b

File tree

5 files changed

+87
-14
lines changed

5 files changed

+87
-14
lines changed

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ Patches and suggestions
3232
- Juan Carlos Garcia Segovia
3333
- Mike West
3434
- Marc DM
35+
- Jim Baker

CHANGES.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@ Change Log
44
0.9999
55
~~~~~~
66

7-
Released on XXX, 2014
7+
Released on XXX, 2015
88

9-
* XXX
9+
* Add support for Python implementations that don't support lone surrogates
10+
(read: Jython).
1011

1112

1213
0.999

html5lib/inputstream.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
2828
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
2929
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
3030

31-
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
31+
32+
# Character class of the code points treated as invalid in the input stream,
# deliberately *without* the lone-surrogate range so that this literal stays
# legal on platforms that cannot represent lone surrogates.
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create the surrogate range with
    # eval. Not using this indirection would introduce an illegal unicode
    # literal on platforms not supporting such lone surrogates.
    #
    # The range must end up *inside* the character class: strip the closing
    # "]", append the range, then close the class again. Concatenating the
    # range after the "]" would build a pattern that only matches an invalid
    # character followed by the literal text U+D800, "-", U+DFFF, so lone
    # surrogates would never be reported.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
                                    eval('"\\uD800-\\uDFFF"') +
                                    "]")
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
3243

3344
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
3445
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
164175
165176
"""
166177

167-
# Craziness
168-
if len("\U0010FFFF") == 1:
178+
if not utils.supports_lone_surrogates:
179+
# Such platforms will have already checked for such
180+
# surrogate errors, so no need to do this checking.
181+
self.reportCharacterErrors = None
182+
self.replaceCharactersRegexp = None
183+
elif len("\U0010FFFF") == 1:
169184
self.reportCharacterErrors = self.characterErrorsUCS4
170-
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
185+
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
171186
else:
172187
self.reportCharacterErrors = self.characterErrorsUCS2
173-
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
188+
self.replaceCharactersRegexp = re.compile(
189+
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
174190

175191
# List of where new lines occur
176192
self.newLines = [0]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
265281
self._bufferedCharacter = data[-1]
266282
data = data[:-1]
267283

268-
self.reportCharacterErrors(data)
284+
if self.reportCharacterErrors:
285+
self.reportCharacterErrors(data)
269286

270-
# Replace invalid characters
271-
# Note U+0000 is dealt with in the tokenizer
272-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
287+
# Replace invalid characters
288+
# Note U+0000 is dealt with in the tokenizer
289+
data = self.replaceCharactersRegexp.sub("\ufffd", data)
273290

274291
data = data.replace("\r\n", "\n")
275292
data = data.replace("\r", "\n")

html5lib/tests/test_tokenizer.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
import warnings
55
import re
66

7+
from six import unichr
8+
79
from .support import get_data_files
810

911
from html5lib.tokenizer import HTMLTokenizer
10-
from html5lib import constants
12+
from html5lib import constants, utils
1113

1214

1315
class TokenizerTestParser(object):
@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
122124
return tokens["expected"] == tokens["received"]
123125

124126

127+
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
128+
129+
125130
def unescape(test):
126131
def decode(inp):
    """Decode \\uXXXX escapes

    This decodes \\uXXXX escapes, possibly into non-BMP characters when
    two surrogate character escapes are adjacent to each other.

    Returns the decoded text, or None when the input contains a lone
    surrogate that this platform cannot represent. On platforms that do
    support lone surrogates a ValueError from unichr is re-raised.
    """
    # This cannot be implemented using the unicode_escape codec
    # because that requires its input be ISO-8859-1, and we need
    # arbitrary unicode as input.

    # Try a genuine high+low surrogate pair first and fall back to a
    # single escape. Blindly pairing any two adjacent escapes would let
    # an unrelated escape swallow the high surrogate that follows it
    # (e.g. \\u0041\\uD83D\\uDE00), leaving two lone surrogates behind
    # instead of one non-BMP character.
    escape_re = re.compile(
        r"\\u([Dd][89ABab][0-9A-Fa-f]{2})\\u([Dd][C-Fc-f][0-9A-Fa-f]{2})"
        r"|\\u([0-9A-Fa-f]{4})")

    def repl(m):
        if m.group(3) is not None:
            # Plain single escape (this includes lone surrogates).
            return unichr(int(m.group(3), 16))
        # The pattern guarantees group 1 is a high surrogate and group 2
        # a low surrogate, so combine them into one code point.
        high = int(m.group(1), 16)
        low = int(m.group(2), 16)
        cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
        return unichr(cp)

    try:
        return escape_re.sub(repl, inp)
    except ValueError:
        # This occurs when unichr throws ValueError, which should
        # only be for a lone-surrogate.
        if utils.supports_lone_surrogates:
            raise
        return None
128159

129160
test["input"] = decode(test["input"])
130161
for token in test["output"]:
@@ -183,6 +214,8 @@ def testTokenizer():
183214
test["initialStates"] = ["Data state"]
184215
if 'doubleEscaped' in test:
185216
test = unescape(test)
217+
if test["input"] is None:
218+
continue # Not valid input for this platform
186219
for initialState in test["initialStates"]:
187220
test["initialState"] = capitalize(initialState)
188221
yield runTokenizerTest, test

html5lib/utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,35 @@
22

33
from types import ModuleType
44

5+
from six import text_type
6+
57
try:
68
import xml.etree.cElementTree as default_etree
79
except ImportError:
810
import xml.etree.ElementTree as default_etree
911

1012

1113
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
12-
"surrogatePairToCodepoint", "moduleFactoryFactory"]
14+
"surrogatePairToCodepoint", "moduleFactoryFactory",
15+
"supports_lone_surrogates"]
16+
17+
18+
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
19+
# caught by the below test. In general this would be any platform
20+
# using UTF-16 as its encoding of unicode strings, such as
21+
# Jython. This is because UTF-16 itself is based on the use of such
22+
# surrogates, and there is no mechanism to further escape such
23+
# escapes.
24+
try:
    # Build the lone surrogate through eval so that this module never
    # contains such a literal itself.
    _x = eval('"\\uD800"')
    if not isinstance(_x, text_type):
        # We need this with u"" because of http://bugs.jython.org/issue2039
        _x = eval('u"\\uD800"')
        if not isinstance(_x, text_type):
            # Explicit raise rather than assert: asserts are stripped
            # under -O, which would silently report support as present.
            raise TypeError("lone surrogate literal did not produce text")
except Exception:
    # Any ordinary failure while building the literal means the platform
    # cannot hold a lone surrogate. KeyboardInterrupt and SystemExit are
    # intentionally left to propagate.
    # NOTE(review): confirm on Jython that the failure surfaces as a
    # Python-level Exception and not as a raw Java throwable.
    supports_lone_surrogates = False
else:
    supports_lone_surrogates = True
1334

1435

1536
class MethodDispatcher(dict):

0 commit comments

Comments
 (0)