|
3 | 3 | from . import support # noqa
|
4 | 4 |
|
5 | 5 | import codecs
|
6 |
| -from io import BytesIO |
| 6 | +import sys |
| 7 | +from io import BytesIO, StringIO |
| 8 | + |
| 9 | +import pytest |
7 | 10 |
|
8 | 11 | import six
|
9 | 12 | from six.moves import http_client, urllib
|
10 | 13 |
|
11 | 14 | from html5lib.inputstream import (BufferedStream, HTMLInputStream,
|
12 | 15 | HTMLUnicodeInputStream, HTMLBinaryInputStream)
|
| 16 | +from html5lib.utils import supports_lone_surrogates |
13 | 17 |
|
14 | 18 |
|
15 | 19 | def test_basic():
|
@@ -211,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
|
211 | 215 | wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
|
212 | 216 | stream = HTMLInputStream(wrapped)
|
213 | 217 | assert stream.charsUntil(" ") == "Text"
|
| 218 | + |
| 219 | + |
@pytest.mark.parametrize("inp,num", [
    # U+0000 - U+001F
    ("\u0000", 0), ("\u0001", 1), ("\u0008", 1), ("\u0009", 0),
    ("\u000A", 0), ("\u000B", 1), ("\u000C", 0), ("\u000D", 0),
    ("\u000E", 1), ("\u001F", 1),
    # U+0020 - U+00A0
    ("\u0020", 0), ("\u007E", 0), ("\u007F", 1), ("\u009F", 1),
    ("\u00A0", 0),
    # Around U+FDD0 - U+FDEF
    ("\uFDCF", 0), ("\uFDD0", 1), ("\uFDEF", 1), ("\uFDF0", 0),
    # Last three code points of the BMP and of planes 1 through 16
    ("\uFFFD", 0), ("\uFFFE", 1), ("\uFFFF", 1),
    ("\U0001FFFD", 0), ("\U0001FFFE", 1), ("\U0001FFFF", 1),
    ("\U0002FFFD", 0), ("\U0002FFFE", 1), ("\U0002FFFF", 1),
    ("\U0003FFFD", 0), ("\U0003FFFE", 1), ("\U0003FFFF", 1),
    ("\U0004FFFD", 0), ("\U0004FFFE", 1), ("\U0004FFFF", 1),
    ("\U0005FFFD", 0), ("\U0005FFFE", 1), ("\U0005FFFF", 1),
    ("\U0006FFFD", 0), ("\U0006FFFE", 1), ("\U0006FFFF", 1),
    ("\U0007FFFD", 0), ("\U0007FFFE", 1), ("\U0007FFFF", 1),
    ("\U0008FFFD", 0), ("\U0008FFFE", 1), ("\U0008FFFF", 1),
    ("\U0009FFFD", 0), ("\U0009FFFE", 1), ("\U0009FFFF", 1),
    ("\U000AFFFD", 0), ("\U000AFFFE", 1), ("\U000AFFFF", 1),
    ("\U000BFFFD", 0), ("\U000BFFFE", 1), ("\U000BFFFF", 1),
    ("\U000CFFFD", 0), ("\U000CFFFE", 1), ("\U000CFFFF", 1),
    ("\U000DFFFD", 0), ("\U000DFFFE", 1), ("\U000DFFFF", 1),
    ("\U000EFFFD", 0), ("\U000EFFFE", 1), ("\U000EFFFF", 1),
    ("\U000FFFFD", 0), ("\U000FFFFE", 1), ("\U000FFFFF", 1),
    ("\U0010FFFD", 0), ("\U0010FFFE", 1), ("\U0010FFFF", 1),
    # Several flagged characters in one input: one error expected per occurrence
    ("\x01\x01\x01", 3),
    ("a\x01a\x01a\x01a", 3),
])
def test_invalid_codepoints(inp, num):
    """Check how many errors the unicode input stream records for ``inp``.

    ``inp`` is wrapped in a ``StringIO`` and handed to
    ``HTMLUnicodeInputStream``; ``char()`` is then called once per element
    of ``inp`` and ``stream.errors`` must end up with exactly ``num``
    entries.
    """
    stream = HTMLUnicodeInputStream(StringIO(inp))
    # One char() call per element of the input string.
    for _ in inp:
        stream.char()
    error_count = len(stream.errors)
    assert error_count == num
| 298 | + |
| 299 | + |
@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
@pytest.mark.parametrize("inp,num",
                         [("'\\uD7FF'", 0),
                          ("'\\uD800'", 1),
                          ("'\\uDBFF'", 1),
                          ("'\\uDC00'", 1),
                          ("'\\uDFFF'", 1),
                          ("'\\uE000'", 0),
                          ("'\\uD800\\uD800\\uD800'", 3),
                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
                          ("'\\uDFFF\\uDBFF'", 2),
                          # Per-parameter marks must be attached through
                          # pytest.param(); wrapping the argument tuple in
                          # the mark call itself is no longer accepted by
                          # current pytest releases.
                          pytest.param("'\\uDBFF\\uDFFF'", 2,
                                       marks=pytest.mark.skipif(
                                           sys.maxunicode == 0xFFFF,
                                           reason="narrow Python"))])
def test_invalid_codepoints_surrogates(inp, num):
    """Check how many errors the stream records for surrogate code points.

    ``inp`` is the *source text* of a string literal (quotes and escapes
    included); it is evaluated to obtain the actual input string, which is
    then read through ``HTMLUnicodeInputStream`` one ``char()`` call per
    element. ``stream.errors`` must end up with exactly ``num`` entries.

    The whole test is skipped when ``supports_lone_surrogates`` is false,
    and the high-then-low surrogate case is skipped when
    ``sys.maxunicode == 0xFFFF``.
    """
    # eval() only ever sees the hard-coded literals from the parametrize
    # table above, never external data.
    # NOTE(review): presumably the literals are kept escaped so the module
    # still compiles where lone surrogates cannot be represented - confirm.
    inp = eval(inp)
    fp = StringIO(inp)
    # If reading the text back yields a character above U+FFFF, StringIO did
    # not preserve the input as written, so the expected count is meaningless.
    if ord(max(fp.read())) > 0xFFFF:
        pytest.skip("StringIO altered string")
    fp.seek(0)
    stream = HTMLUnicodeInputStream(fp)
    for _i in range(len(inp)):
        stream.char()
    assert len(stream.errors) == num
0 commit comments