Skip to content

Commit 625303f

Browse files
committed
unicode-escape is undocumented as a decoder, so implement our own.
This also makes it easier to see what is going wrong if something does go wrong under Jython.
1 parent f4ee9d3 commit 625303f

File tree

1 file changed

+22
-17
lines changed

1 file changed

+22
-17
lines changed

html5lib/tests/test_tokenizer.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import warnings
55
import re
66

7+
from six import unichr
8+
79
from .support import get_data_files
810

911
from html5lib.tokenizer import HTMLTokenizer
@@ -122,28 +124,31 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
122124
return tokens["expected"] == tokens["received"]
123125

124126

125-
_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
127+
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
126128

127129

128130
def unescape(test):
129131
def decode(inp):
132+
def repl(m):
    """Return the text denoted by one match of ``\\uXXXX`` escapes.

    Group 1 of the match holds the four hex digits of an escape; group 2
    holds the digits of an immediately following escape, or is None when
    the match covers a single escape only.
    """
    first = int(m.group(1), 16)
    if m.group(2) is None:
        # Only one escape was matched: emit that code unit as-is.
        return unichr(first)

    second = int(m.group(2), 16)
    forms_surrogate_pair = (0xD800 <= first <= 0xDBFF and
                            0xDC00 <= second <= 0xDFFF)
    if not forms_surrogate_pair:
        # Two adjacent escapes that are not a high/low surrogate pair
        # are converted independently and concatenated.
        return unichr(first) + unichr(second)

    # Fold the high and low surrogate into one supplementary code point.
    combined = ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000
    return unichr(combined)
130144
try:
131-
return inp.encode("utf-8").decode("unicode-escape")
132-
except UnicodeDecodeError:
133-
possible_surrogate_match = _surrogateRe.search(inp)
134-
if possible_surrogate_match and not utils.supports_lone_surrogates:
135-
possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
136-
if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
137-
# Not valid unicode input for platforms that do
138-
# not have support for lone surrogates.
139-
#
140-
# NOTE it's not even possible to have such
141-
# isolated surrogates in unicode input streams in
142-
# such platforms (like Jython) - the decoding to
143-
# unicode would have raised a similar
144-
# UnicodeDecodeError.
145-
return None
146-
raise
145+
return _surrogateRe.sub(repl, inp)
146+
except ValueError:
147+
# This occurs when unichr throws ValueError, which should
148+
# only be for a lone-surrogate.
149+
if utils.supports_lone_surrogates:
150+
raise
151+
return None
147152

148153
test["input"] = decode(test["input"])
149154
for token in test["output"]:

0 commit comments

Comments
 (0)