Skip to content

Commit dc2b477

Browse files
authored
Merge branch 'master' into wbr
2 parents f3c8eb3 + f4646e6 commit dc2b477

File tree

80 files changed

+9203
-961
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+9203
-961
lines changed

.appveyor.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ environment:
66
matrix:
77
- TOXENV: py27-base
88
- TOXENV: py27-optional
9-
- TOXENV: py34-base
10-
- TOXENV: py34-optional
119
- TOXENV: py35-base
1210
- TOXENV: py35-optional
1311
- TOXENV: py36-base
1412
- TOXENV: py36-optional
13+
- TOXENV: py37-base
14+
- TOXENV: py37-optional
15+
- TOXENV: py38-base
16+
- TOXENV: py38-optional
1517

1618
install:
1719
- git submodule update --init --recursive

.pytest.expect

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,22 @@
11
pytest-expect file v1
2-
(2, 7, 11, 'final', 0)
3-
b'html5lib/tests/test_encoding.py::test_encoding::[110]': FAIL
4-
b'html5lib/tests/test_encoding.py::test_encoding::[111]': FAIL
2+
(2, 7, 18, 'final', 0)
3+
b'html5lib/tests/test_encoding.py::test_parser_encoding[<!DOCTYPE HTML>\\n<script>document.write(\'<meta charset="ISO-8859-\' + \'2">\')</script>-iso-8859-2]': FAIL
4+
b'html5lib/tests/test_encoding.py::test_prescan_encoding[<!DOCTYPE HTML>\\n<script>document.write(\'<meta charset="ISO-8859-\' + \'2">\')</script>-iso-8859-2]': FAIL
5+
u'html5lib/tests/testdata/tokenizer/domjs.test::4::cdataSectionState': FAIL
56
u'html5lib/tests/testdata/tokenizer/test2.test::0::dataState': FAIL
6-
u'html5lib/tests/testdata/tokenizer/test3.test::228::dataState': FAIL
7-
u'html5lib/tests/testdata/tokenizer/test3.test::231::dataState': FAIL
8-
u'html5lib/tests/testdata/tokenizer/test3.test::232::dataState': FAIL
9-
u'html5lib/tests/testdata/tokenizer/test3.test::234::dataState': FAIL
10-
u'html5lib/tests/testdata/tokenizer/test3.test::235::dataState': FAIL
11-
u'html5lib/tests/testdata/tokenizer/test3.test::237::dataState': FAIL
12-
u'html5lib/tests/testdata/tokenizer/test3.test::240::dataState': FAIL
13-
u'html5lib/tests/testdata/tokenizer/test3.test::241::dataState': FAIL
14-
u'html5lib/tests/testdata/tokenizer/test3.test::243::dataState': FAIL
15-
u'html5lib/tests/testdata/tokenizer/test3.test::244::dataState': FAIL
16-
u'html5lib/tests/testdata/tokenizer/test3.test::246::dataState': FAIL
17-
u'html5lib/tests/testdata/tokenizer/test3.test::258::dataState': FAIL
18-
u'html5lib/tests/testdata/tokenizer/test3.test::656::dataState': FAIL
7+
u'html5lib/tests/testdata/tokenizer/test3.test::280::dataState': FAIL
8+
u'html5lib/tests/testdata/tokenizer/test3.test::283::dataState': FAIL
9+
u'html5lib/tests/testdata/tokenizer/test3.test::284::dataState': FAIL
10+
u'html5lib/tests/testdata/tokenizer/test3.test::286::dataState': FAIL
11+
u'html5lib/tests/testdata/tokenizer/test3.test::287::dataState': FAIL
12+
u'html5lib/tests/testdata/tokenizer/test3.test::289::dataState': FAIL
13+
u'html5lib/tests/testdata/tokenizer/test3.test::292::dataState': FAIL
14+
u'html5lib/tests/testdata/tokenizer/test3.test::293::dataState': FAIL
15+
u'html5lib/tests/testdata/tokenizer/test3.test::295::dataState': FAIL
16+
u'html5lib/tests/testdata/tokenizer/test3.test::296::dataState': FAIL
17+
u'html5lib/tests/testdata/tokenizer/test3.test::298::dataState': FAIL
18+
u'html5lib/tests/testdata/tokenizer/test3.test::310::dataState': FAIL
19+
u'html5lib/tests/testdata/tokenizer/test3.test::718::dataState': FAIL
1920
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::DOM::parser::namespaced': FAIL
2021
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::DOM::parser::void-namespace': FAIL
2122
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::ElementTree::parser::namespaced': FAIL
@@ -24,6 +25,14 @@ u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::cElementTree::pa
2425
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::cElementTree::parser::void-namespace': FAIL
2526
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::lxml::parser::namespaced': FAIL
2627
u'html5lib/tests/testdata/tree-construction/adoption01.dat::17::lxml::parser::void-namespace': FAIL
28+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::DOM::parser::namespaced': FAIL
29+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::DOM::parser::void-namespace': FAIL
30+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::ElementTree::parser::namespaced': FAIL
31+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::ElementTree::parser::void-namespace': FAIL
32+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::cElementTree::parser::namespaced': FAIL
33+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::cElementTree::parser::void-namespace': FAIL
34+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::lxml::parser::namespaced': FAIL
35+
u'html5lib/tests/testdata/tree-construction/blocks.dat::12::lxml::parser::void-namespace': FAIL
2736
u'html5lib/tests/testdata/tree-construction/foreign-fragment.dat::0::DOM::parser::namespaced': FAIL
2837
u'html5lib/tests/testdata/tree-construction/foreign-fragment.dat::0::DOM::parser::void-namespace': FAIL
2938
u'html5lib/tests/testdata/tree-construction/foreign-fragment.dat::0::ElementTree::parser::namespaced': FAIL
@@ -216,30 +225,6 @@ u'html5lib/tests/testdata/tree-construction/isindex.dat::3::cElementTree::parser
216225
u'html5lib/tests/testdata/tree-construction/isindex.dat::3::cElementTree::parser::void-namespace': FAIL
217226
u'html5lib/tests/testdata/tree-construction/isindex.dat::3::lxml::parser::namespaced': FAIL
218227
u'html5lib/tests/testdata/tree-construction/isindex.dat::3::lxml::parser::void-namespace': FAIL
219-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::DOM::parser::namespaced': FAIL
220-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::DOM::parser::void-namespace': FAIL
221-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::ElementTree::parser::namespaced': FAIL
222-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::ElementTree::parser::void-namespace': FAIL
223-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::cElementTree::parser::namespaced': FAIL
224-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::cElementTree::parser::void-namespace': FAIL
225-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::lxml::parser::namespaced': FAIL
226-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::3::lxml::parser::void-namespace': FAIL
227-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::DOM::parser::namespaced': FAIL
228-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::DOM::parser::void-namespace': FAIL
229-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::ElementTree::parser::namespaced': FAIL
230-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::ElementTree::parser::void-namespace': FAIL
231-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::cElementTree::parser::namespaced': FAIL
232-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::cElementTree::parser::void-namespace': FAIL
233-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::lxml::parser::namespaced': FAIL
234-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::4::lxml::parser::void-namespace': FAIL
235-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::DOM::parser::namespaced': FAIL
236-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::DOM::parser::void-namespace': FAIL
237-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::ElementTree::parser::namespaced': FAIL
238-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::ElementTree::parser::void-namespace': FAIL
239-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::cElementTree::parser::namespaced': FAIL
240-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::cElementTree::parser::void-namespace': FAIL
241-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::lxml::parser::namespaced': FAIL
242-
u'html5lib/tests/testdata/tree-construction/menuitem-element.dat::5::lxml::parser::void-namespace': FAIL
243228
u'html5lib/tests/testdata/tree-construction/namespace-sensitivity.dat::0::DOM::parser::namespaced': FAIL
244229
u'html5lib/tests/testdata/tree-construction/namespace-sensitivity.dat::0::DOM::parser::void-namespace': FAIL
245230
u'html5lib/tests/testdata/tree-construction/namespace-sensitivity.dat::0::ElementTree::parser::namespaced': FAIL
@@ -1248,6 +1233,14 @@ u'html5lib/tests/testdata/tree-construction/tests11.dat::6::cElementTree::parser
12481233
u'html5lib/tests/testdata/tree-construction/tests11.dat::6::cElementTree::parser::void-namespace': FAIL
12491234
u'html5lib/tests/testdata/tree-construction/tests11.dat::6::lxml::parser::namespaced': FAIL
12501235
u'html5lib/tests/testdata/tree-construction/tests11.dat::6::lxml::parser::void-namespace': FAIL
1236+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::DOM::parser::namespaced': FAIL
1237+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::DOM::parser::void-namespace': FAIL
1238+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::ElementTree::parser::namespaced': FAIL
1239+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::ElementTree::parser::void-namespace': FAIL
1240+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::cElementTree::parser::namespaced': FAIL
1241+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::cElementTree::parser::void-namespace': FAIL
1242+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::lxml::parser::namespaced': FAIL
1243+
u'html5lib/tests/testdata/tree-construction/tests18.dat::15::lxml::parser::void-namespace': FAIL
12511244
u'html5lib/tests/testdata/tree-construction/tests19.dat::14::DOM::parser::namespaced': FAIL
12521245
u'html5lib/tests/testdata/tree-construction/tests19.dat::14::DOM::parser::void-namespace': FAIL
12531246
u'html5lib/tests/testdata/tree-construction/tests19.dat::14::ElementTree::parser::namespaced': FAIL
@@ -1296,6 +1289,14 @@ u'html5lib/tests/testdata/tree-construction/tests25.dat::7::cElementTree::parser
12961289
u'html5lib/tests/testdata/tree-construction/tests25.dat::7::cElementTree::parser::void-namespace': FAIL
12971290
u'html5lib/tests/testdata/tree-construction/tests25.dat::7::lxml::parser::namespaced': FAIL
12981291
u'html5lib/tests/testdata/tree-construction/tests25.dat::7::lxml::parser::void-namespace': FAIL
1292+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::DOM::parser::namespaced': FAIL
1293+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::DOM::parser::void-namespace': FAIL
1294+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::ElementTree::parser::namespaced': FAIL
1295+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::ElementTree::parser::void-namespace': FAIL
1296+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::cElementTree::parser::namespaced': FAIL
1297+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::cElementTree::parser::void-namespace': FAIL
1298+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::lxml::parser::namespaced': FAIL
1299+
u'html5lib/tests/testdata/tree-construction/tests8.dat::5::lxml::parser::void-namespace': FAIL
12991300
u'html5lib/tests/testdata/tree-construction/webkit02.dat::14::DOM::parser::namespaced': FAIL
13001301
u'html5lib/tests/testdata/tree-construction/webkit02.dat::14::DOM::parser::void-namespace': FAIL
13011302
u'html5lib/tests/testdata/tree-construction/webkit02.dat::14::ElementTree::parser::namespaced': FAIL

.travis.yml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,20 @@
11
language: python
22
python:
3+
- "pypy3"
34
- "pypy"
5+
- "3.8"
6+
- "3.7"
47
- "3.6"
58
- "3.5"
6-
- "3.4"
79
- "2.7"
8-
9-
sudo: false
10+
- "3.9-dev"
1011

1112
cache: pip
1213

1314
env:
1415
global:
1516
- PYTEST_COMMAND="coverage run -m pytest"
16-
matrix:
17-
- TOXENV=optional
18-
- TOXENV=base
19-
- TOXENV=six19-optional
17+
- TOXENV=base,optional,six19-optional
2018

2119
install:
2220
- pip install tox codecov

AUTHORS.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Credits
44
``html5lib`` is written and maintained by:
55

66
- James Graham
7-
- Geoffrey Sneddon
7+
- Sam Sneddon
88
- Łukasz Langa
99
- Will Kahn-Greene
1010

CHANGES.rst

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,40 @@
11
Change Log
22
----------
33

4-
1.1.0
5-
~~~~~
4+
1.2
5+
~~~
66

77
Features:
88

99
* Add support for the ``<wbr>`` element, `which indicates a line break
1010
opportunity <https://html.spec.whatwg.org/#the-wbr-element>`_. This element
1111
is allowed by default by the sanitizer. (#395) (Thank you, Tom Most!)
1212

13+
14+
1.1
15+
~~~
16+
17+
UNRELEASED
18+
19+
Breaking changes:
20+
21+
* Drop support for Python 3.3. (#358)
22+
* Drop support for Python 3.4. (#421)
23+
24+
Deprecations:
25+
26+
* Deprecate the ``html5lib`` sanitizer (``html5lib.serialize(sanitize=True)`` and
27+
``html5lib.filters.sanitizer``). We recommend users migrate to `Bleach
28+
<https://github.com/mozilla/bleach>`_. Please let us know if Bleach doesn't suffice for your
29+
use. (#443)
30+
31+
Other changes:
32+
33+
* Try to import from ``collections.abc`` to remove DeprecationWarning and ensure
34+
``html5lib`` keeps working in future Python versions. (#403)
35+
* Drop optional ``datrie`` dependency. (#442)
36+
37+
1338
1.0.1
1439
~~~~~
1540

@@ -29,7 +54,7 @@ Features:
2954
* Support Python 3.6. (#333) (Thank you, Jon Dufresne!)
3055
* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!)
3156
* Improve testing and CI and add code coverage (#323, #334) (Thank you, Jon
32-
Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!)
57+
Dufresne, John Vandenberg, Sam Sneddon, Will Kahn-Greene!)
3358
* Semver-compliant version number.
3459

3560
Bug fixes:

README.rst

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,23 +91,22 @@ More documentation is available at https://html5lib.readthedocs.io/.
9191
Installation
9292
------------
9393

94-
html5lib works on CPython 2.7+, CPython 3.4+ and PyPy. To install it,
95-
use:
94+
html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
9695

9796
.. code-block:: bash
9897
9998
$ pip install html5lib
10099
100+
The goal is to support a (non-strict) superset of the versions that `pip
101+
supports
102+
<https://pip.pypa.io/en/stable/installing/#python-and-os-compatibility>`_.
101103

102104
Optional Dependencies
103105
---------------------
104106

105107
The following third-party libraries may be used for additional
106108
functionality:
107109

108-
- ``datrie`` can be used under CPython to improve parsing performance
109-
(though in almost all cases the improvement is marginal);
110-
111110
- ``lxml`` is supported as a tree format (for both building and
112111
walking) under CPython (but *not* PyPy where it is known to cause
113112
segfaults);

benchmarks/bench_html.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import io
2+
import os
3+
import sys
4+
5+
import pyperf
6+
7+
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
8+
import html5lib # noqa: E402
9+
10+
11+
def bench_parse(fh, treebuilder):
12+
fh.seek(0)
13+
html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)
14+
15+
16+
def bench_serialize(loops, fh, treebuilder):
17+
fh.seek(0)
18+
doc = html5lib.parse(fh, treebuilder=treebuilder, useChardet=False)
19+
20+
range_it = range(loops)
21+
t0 = pyperf.perf_counter()
22+
23+
for loops in range_it:
24+
html5lib.serialize(doc, tree=treebuilder, encoding="ascii", inject_meta_charset=False)
25+
26+
return pyperf.perf_counter() - t0
27+
28+
29+
BENCHMARKS = ["parse", "serialize"]
30+
31+
32+
def add_cmdline_args(cmd, args):
33+
if args.benchmark:
34+
cmd.append(args.benchmark)
35+
36+
37+
if __name__ == "__main__":
38+
runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
39+
runner.metadata["description"] = "Run benchmarks based on Anolis"
40+
runner.argparser.add_argument("benchmark", nargs="?", choices=BENCHMARKS)
41+
42+
args = runner.parse_args()
43+
if args.benchmark:
44+
benchmarks = (args.benchmark,)
45+
else:
46+
benchmarks = BENCHMARKS
47+
48+
with open(os.path.join(os.path.dirname(__file__), "data", "html.html"), "rb") as fh:
49+
source = io.BytesIO(fh.read())
50+
51+
if "parse" in benchmarks:
52+
for tb in ("etree", "dom", "lxml"):
53+
runner.bench_func("html_parse_%s" % tb, bench_parse, source, tb)
54+
55+
if "serialize" in benchmarks:
56+
for tb in ("etree", "dom", "lxml"):
57+
runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)

benchmarks/bench_wpt.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import io
2+
import os
3+
import sys
4+
5+
import pyperf
6+
7+
sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
8+
import html5lib # noqa: E402
9+
10+
11+
def bench_html5lib(fh):
12+
fh.seek(0)
13+
html5lib.parse(fh, treebuilder="etree", useChardet=False)
14+
15+
16+
def add_cmdline_args(cmd, args):
17+
if args.benchmark:
18+
cmd.append(args.benchmark)
19+
20+
21+
BENCHMARKS = {}
22+
for root, dirs, files in os.walk(os.path.join(os.path.dirname(os.path.abspath(__file__)), "data", "wpt")):
23+
for f in files:
24+
if f.endswith(".html"):
25+
BENCHMARKS[f[: -len(".html")]] = os.path.join(root, f)
26+
27+
28+
if __name__ == "__main__":
29+
runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
30+
runner.metadata["description"] = "Run parser benchmarks from WPT"
31+
runner.argparser.add_argument("benchmark", nargs="?", choices=sorted(BENCHMARKS))
32+
33+
args = runner.parse_args()
34+
if args.benchmark:
35+
benchmarks = (args.benchmark,)
36+
else:
37+
benchmarks = sorted(BENCHMARKS)
38+
39+
for bench in benchmarks:
40+
name = "wpt_%s" % bench
41+
path = BENCHMARKS[bench]
42+
with open(path, "rb") as fh:
43+
fh2 = io.BytesIO(fh.read())
44+
45+
runner.bench_func(name, bench_html5lib, fh2)

benchmarks/data/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
The files in this directory are derived from:
2+
3+
* `html.html`: from [html](http://github.com/whatwg/html), revision
4+
77db356a293f2b152b648c836b6989d17afe42bb. This is the first 5000 lines of `source`. (This is
5+
representative of the input to [Anolis](https://bitbucket.org/ms2ger/anolis/); first 5000 lines
6+
chosen to make it parse in a reasonable time.)
7+
8+
* `wpt`: see `wpt/README.md`.

0 commit comments

Comments
 (0)