|
4 | 4 | import warnings
|
5 | 5 | import re
|
6 | 6 |
|
| 7 | +from six import unichr |
| 8 | + |
7 | 9 | from .support import get_data_files
|
8 | 10 |
|
9 | 11 | from html5lib.tokenizer import HTMLTokenizer
|
@@ -122,28 +124,31 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
|
122 | 124 | return tokens["expected"] == tokens["received"]
|
123 | 125 |
|
124 | 126 |
|
125 |
| -_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})") |
| 127 | +_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?") |
126 | 128 |
|
127 | 129 |
|
128 | 130 | def unescape(test):
|
129 | 131 | def decode(inp):
|
| 132 | + def repl(m): |
| 133 | + if m.group(2) is not None: |
| 134 | + high = int(m.group(1), 16) |
| 135 | + low = int(m.group(2), 16) |
| 136 | + if (0xD800 <= high <= 0xDBFF and |
| 137 | + 0xDC00 <= low <= 0xDFFF): |
| 138 | + cp = ((high - 0xD800) << 10) + (low - 0xDc00) + 0x10000 |
| 139 | + return unichr(cp) |
| 140 | + else: |
| 141 | + return unichr(high) + unichr(low) |
| 142 | + else: |
| 143 | + return unichr(int(m.group(1), 16)) |
130 | 144 | try:
|
131 |
| - return inp.encode("utf-8").decode("unicode-escape") |
132 |
| - except UnicodeDecodeError: |
133 |
| - possible_surrogate_match = _surrogateRe.search(inp) |
134 |
| - if possible_surrogate_match and not utils.supports_lone_surrogates: |
135 |
| - possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) |
136 |
| - if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: |
137 |
| - # Not valid unicode input for platforms that do |
138 |
| - # not have support for lone surrogates. |
139 |
| - # |
140 |
| - # NOTE it's not even possible to have such |
141 |
| - # isolated surrogates in unicode input streams in |
142 |
| - # such platforms (like Jython) - the decoding to |
143 |
| - # unicode would have raised a similar |
144 |
| - # UnicodeDecodeError. |
145 |
| - return None |
146 |
| - raise |
| 145 | + return _surrogateRe.sub(repl, inp) |
| 146 | + except ValueError: |
| 147 | + # This occurs when unichr throws ValueError, which should |
| 148 | + # only be for a lone-surrogate. |
| 149 | + if utils.supports_lone_surrogates: |
| 150 | + raise |
| 151 | + return None |
147 | 152 |
|
148 | 153 | test["input"] = decode(test["input"])
|
149 | 154 | for token in test["output"]:
|
|
0 commit comments