Skip to content

Commit 742715d

Browse files
committed
Fix invalid_unicode_re on platforms supporting lone surrogates
1 parent df0b2ba commit 742715d

File tree

2 files changed

+115
-3
lines changed

2 files changed

+115
-3
lines changed

html5lib/inputstream.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,10 @@
3333
# unichr. Not using this indirection would introduce an illegal
3434
# unicode literal on platforms not supporting such lone
3535
# surrogates.
36-
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
37-
eval('"\\uD800-\\uDFFF"'))
36+
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
37+
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
38+
eval('"\\uD800-\\uDFFF"') +
39+
"]")
3840
else:
3941
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
4042

html5lib/tests/test_stream.py

Lines changed: 111 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,17 @@
33
from . import support # noqa
44

55
import codecs
6-
from io import BytesIO
6+
import sys
7+
from io import BytesIO, StringIO
8+
9+
import pytest
710

811
import six
912
from six.moves import http_client, urllib
1013

1114
from html5lib.inputstream import (BufferedStream, HTMLInputStream,
1215
HTMLUnicodeInputStream, HTMLBinaryInputStream)
16+
from html5lib.utils import supports_lone_surrogates
1317

1418

1519
def test_basic():
@@ -211,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
211215
wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
212216
stream = HTMLInputStream(wrapped)
213217
assert stream.charsUntil(" ") == "Text"
218+
219+
220+
@pytest.mark.parametrize("inp,num", [
    # Low end of the range: U+0000 through U+0020.
    ("\u0000", 0), ("\u0001", 1), ("\u0008", 1),
    ("\u0009", 0), ("\u000A", 0), ("\u000B", 1),
    ("\u000C", 0), ("\u000D", 0), ("\u000E", 1),
    ("\u001F", 1), ("\u0020", 0),
    # Boundaries around U+007F..U+009F.
    ("\u007E", 0), ("\u007F", 1), ("\u009F", 1), ("\u00A0", 0),
    # Boundaries around U+FDD0..U+FDEF.
    ("\uFDCF", 0), ("\uFDD0", 1), ("\uFDEF", 1), ("\uFDF0", 0),
    # Last three code points of the BMP.
    ("\uFFFD", 0), ("\uFFFE", 1), ("\uFFFF", 1),
    # Last three code points of each supplementary plane (1 through 16).
    ("\U0001FFFD", 0), ("\U0001FFFE", 1), ("\U0001FFFF", 1),
    ("\U0002FFFD", 0), ("\U0002FFFE", 1), ("\U0002FFFF", 1),
    ("\U0003FFFD", 0), ("\U0003FFFE", 1), ("\U0003FFFF", 1),
    ("\U0004FFFD", 0), ("\U0004FFFE", 1), ("\U0004FFFF", 1),
    ("\U0005FFFD", 0), ("\U0005FFFE", 1), ("\U0005FFFF", 1),
    ("\U0006FFFD", 0), ("\U0006FFFE", 1), ("\U0006FFFF", 1),
    ("\U0007FFFD", 0), ("\U0007FFFE", 1), ("\U0007FFFF", 1),
    ("\U0008FFFD", 0), ("\U0008FFFE", 1), ("\U0008FFFF", 1),
    ("\U0009FFFD", 0), ("\U0009FFFE", 1), ("\U0009FFFF", 1),
    ("\U000AFFFD", 0), ("\U000AFFFE", 1), ("\U000AFFFF", 1),
    ("\U000BFFFD", 0), ("\U000BFFFE", 1), ("\U000BFFFF", 1),
    ("\U000CFFFD", 0), ("\U000CFFFE", 1), ("\U000CFFFF", 1),
    ("\U000DFFFD", 0), ("\U000DFFFE", 1), ("\U000DFFFF", 1),
    ("\U000EFFFD", 0), ("\U000EFFFE", 1), ("\U000EFFFF", 1),
    ("\U000FFFFD", 0), ("\U000FFFFE", 1), ("\U000FFFFF", 1),
    ("\U0010FFFD", 0), ("\U0010FFFE", 1), ("\U0010FFFF", 1),
    # Several flagged characters in one input, adjacent and interleaved.
    ("\x01\x01\x01", 3),
    ("a\x01a\x01a\x01a", 3),
])
def test_invalid_codepoints(inp, num):
    """Check how many errors the stream records while reading ``inp``.

    ``inp`` is wrapped in a ``StringIO`` and consumed through
    ``HTMLUnicodeInputStream.char()``, one call per element of ``inp``;
    afterwards ``stream.errors`` must hold exactly ``num`` entries.
    """
    stream = HTMLUnicodeInputStream(StringIO(inp))
    # One char() call per element of the input string.
    for _ in inp:
        stream.char()
    assert len(stream.errors) == num
298+
299+
300+
@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
@pytest.mark.parametrize("inp,num",
                         [("'\\uD7FF'", 0),
                          ("'\\uD800'", 1),
                          ("'\\uDBFF'", 1),
                          ("'\\uDC00'", 1),
                          ("'\\uDFFF'", 1),
                          ("'\\uE000'", 0),
                          ("'\\uD800\\uD800\\uD800'", 3),
                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
                          ("'\\uDFFF\\uDBFF'", 2),
                          # Marks on a single parameter set must go through
                          # pytest.param(); calling the mark with the values
                          # as an argument is no longer supported by pytest.
                          pytest.param("'\\uDBFF\\uDFFF'", 2,
                                       marks=pytest.mark.skipif(
                                           sys.maxunicode == 0xFFFF,
                                           reason="narrow Python"))])
def test_invalid_codepoints_surrogates(inp, num):
    """Check how many errors the stream records for surrogate code points.

    ``inp`` is the *source text* of a string literal (quotes and escaped
    ``\\u`` sequences included), not the string itself. It is evaluated at
    run time, so this module never contains a lone-surrogate literal --
    presumably because such a literal is illegal on platforms without
    lone-surrogate support.

    After reading one character per element of the evaluated string,
    ``stream.errors`` must hold exactly ``num`` entries.
    """
    # Only ever applied to the hard-coded literals in the parametrize list
    # above; never feed externally supplied text through this eval().
    inp = eval(inp)
    fp = StringIO(inp)
    # If reading back yields a character above the BMP, StringIO did not
    # preserve the string as written, so the expected count no longer applies.
    if ord(max(fp.read())) > 0xFFFF:
        pytest.skip("StringIO altered string")
    fp.seek(0)
    stream = HTMLUnicodeInputStream(fp)
    for _i in range(len(inp)):
        stream.char()
    assert len(stream.errors) == num

0 commit comments

Comments
 (0)