From 224d9f4e243f6645e88b32ad7342a55128f19eeb Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 08:22:35 -0700 Subject: [PATCH 01/19] Fix formatting of docstring example It runs together in the built HTML. --- html5lib/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index f3cd9455..e615ffae 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -4,11 +4,11 @@ HTML found in the wild and implements well-defined error recovery that is largely compatible with modern desktop web browsers. -Example usage: +Example usage:: -import html5lib -f = open("my_document.html") -tree = html5lib.parse(f) + import html5lib + f = open("my_document.html") + tree = html5lib.parse(f) """ from __future__ import absolute_import, division, unicode_literals From 3fb6af3f78d1cc95b25b71306a2d82b0c12f7996 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 08:24:30 -0700 Subject: [PATCH 02/19] Use with, it's idiomatic --- html5lib/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index e615ffae..b4639bde 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -7,8 +7,8 @@ Example usage:: import html5lib - f = open("my_document.html") - tree = html5lib.parse(f) + with open("my_document.html") as f: + tree = html5lib.parse(f) """ from __future__ import absolute_import, division, unicode_literals From ba63e09ed9421b3aecf7ac97b0e943f2f74e1825 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:27:34 -0700 Subject: [PATCH 03/19] Fix typo in changelog --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 570c9605..ce2d2c5b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -63,7 +63,7 @@ Released on July 14, 2016 to clarify their status as public.** * **Get rid of the sanitizer package. 
Merge sanitizer.sanitize into the - sanitizer.htmlsanitizer module and move that to saniziter. This means + sanitizer.htmlsanitizer module and move that to sanitizer. This means anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no code changes.** From 6b99d52ad6867ee6262ddfcac42a501be54685c7 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:28:17 -0700 Subject: [PATCH 04/19] Export and document html5lib.__version__ It's not much use if it's private. --- html5lib/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index b4639bde..f202d5b3 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -19,7 +19,8 @@ from .serializer import serialize __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", - "getTreeWalker", "serialize"] + "getTreeWalker", "serialize", "__version__"] # this has to be at the top level, see how setup.py parses this +#: Distribution version number, which asymptotically approaches 1. __version__ = "0.9999999999-dev" From 323d736ca03bbd2535987eed9b55f6f000f5fba5 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:28:38 -0700 Subject: [PATCH 05/19] Add a documentation env to tox.ini Run "tox -e doc" to build the documentation in doc/_build. --- tox.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index da64de71..830999cd 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional} +envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional},doc [testenv] deps = @@ -11,7 +11,12 @@ deps = base: webencodings py26-base: ordereddict optional: -r{toxinidir}/requirements-optional.txt + doc: Sphinx commands = {envbindir}/py.test {toxinidir}/flake8-run.sh + +[testenv:doc] +changedir = doc +commands = sphinx-build -b html . 
_build From 964d0e166ac49062c6cbabc1263f624332e3afec Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:31:56 -0700 Subject: [PATCH 06/19] Clean up html5lib module documentation Right now the docs have entries for re-exports like html5lib.__init__.HTMLParser, including full class documentation. This is redundant with the docs for html5lib.html5parser.HTMLParser, which is a public name anyway, so I think that it is best to be explicit that this is a re-export. --- doc/html5lib.rst | 6 ++---- html5lib/__init__.py | 7 +++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/html5lib.rst b/doc/html5lib.rst index f0646aac..22af7728 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -4,10 +4,8 @@ html5lib Package :mod:`html5lib` Package ----------------------- -.. automodule:: html5lib.__init__ - :members: - :undoc-members: - :show-inheritance: +.. automodule:: html5lib + :members: __version__ :mod:`constants` Module ----------------------- diff --git a/html5lib/__init__.py b/html5lib/__init__.py index f202d5b3..745b9342 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -9,6 +9,13 @@ import html5lib with open("my_document.html") as f: tree = html5lib.parse(f) + +For convenience, this module re-exports the following names: + +* :func:`~.html5parser.parse`, :func:`~.html5parser.parseFragment`, and :class:`~.html5parser.HTMLParser` +* :func:`~.treebuilders.getTreeBuilder` +* :func:`~.treewalkers.getTreeWalker` +* :func:`~.serializer.serialize` """ from __future__ import absolute_import, division, unicode_literals From abf622432db7169f2af7ecc71980b923643f2711 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:36:43 -0700 Subject: [PATCH 07/19] Remove docs for HTMLTokenizer and HTMLSanitizer HTMLTokenizer is now a private API (I cannot find a public export). HTMLSanitizer no longer exists as a tokenizer, and has been replaced with a filter. 
--- doc/movingparts.rst | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 80ee2ad1..3eeff4f2 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -169,41 +169,3 @@ the following way: * If all else fails, the default encoding will be used. This is usually `Windows-1252 `_, which is a common fallback used by Web browsers. - - -Tokenizers ----------- - -The part of the parser responsible for translating a raw input stream -into meaningful tokens is the tokenizer. Currently html5lib provides -two. - -To set up a tokenizer, simply pass it when instantiating -a :class:`~html5lib.html5parser.HTMLParser`: - -.. code-block:: python - - import html5lib - from html5lib import sanitizer - - p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer) - p.parse("

Surprise!") - -HTMLTokenizer -~~~~~~~~~~~~~ - -This is the default tokenizer, the heart of html5lib. The implementation -can be found in `html5lib/tokenizer.py -`_. - -HTMLSanitizer -~~~~~~~~~~~~~ - -This is a tokenizer that removes unsafe markup and CSS styles from the -input. Elements that are known to be safe are passed through and the -rest is converted to visible text. The default configuration of the -sanitizer follows the `WHATWG Sanitization Rules -`_. - -The implementation can be found in `html5lib/sanitizer.py -`_. From 85540983f6285c82f2a1c4a8d756ae58d0c1e713 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:40:10 -0700 Subject: [PATCH 08/19] Fix Sphinx title underline warnings --- doc/html5lib.rst | 2 +- doc/html5lib.treewalkers.rst | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/html5lib.rst b/doc/html5lib.rst index 22af7728..44e34573 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -24,7 +24,7 @@ html5lib Package :show-inheritance: :mod:`serializer` Module ----------------------- +------------------------ .. automodule:: html5lib.serializer :members: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 46501258..085d8a98 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -10,7 +10,7 @@ treewalkers Package :show-inheritance: :mod:`base` Module -------------------- +------------------ .. automodule:: html5lib.treewalkers.base :members: @@ -34,7 +34,7 @@ treewalkers Package :show-inheritance: :mod:`etree_lxml` Module ------------------------ +------------------------ .. automodule:: html5lib.treewalkers.etree_lxml :members: @@ -43,9 +43,9 @@ treewalkers Package :mod:`genshi` Module --------------------------- +-------------------- .. 
automodule:: html5lib.treewalkers.genshi :members: :undoc-members: - :show-inheritance: \ No newline at end of file + :show-inheritance: From c8fca0ecc7c704995947601e03da0c34a85ecdf5 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:50:52 -0700 Subject: [PATCH 09/19] Open in binary mode for Python 3 --- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 745b9342..b1970d29 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -7,7 +7,7 @@ Example usage:: import html5lib - with open("my_document.html") as f: + with open("my_document.html", "rb") as f: tree = html5lib.parse(f) For convenience, this module re-exports the following names: From 637826ffa72ca982dff6ae7204e4afcc35f3e29e Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 11:51:16 -0700 Subject: [PATCH 10/19] Update and expand "moving parts" doc --- doc/movingparts.rst | 65 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 3eeff4f2..1f3086cb 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -4,22 +4,25 @@ The moving parts html5lib consists of a number of components, which are responsible for handling its features. +Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document. +Several tree representations are supported, as are translations to other formats via *tree adapters*. +The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes. +The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization. Tree builders ------------- The parser reads HTML by tokenizing the content and building a tree that -the user can later access. 
There are three main types of trees that -html5lib can build: +the user can later access. html5lib can build three types of trees: -* ``etree`` - this is the default; builds a tree based on ``xml.etree``, +* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`, which can be found in the standard library. Whenever possible, the accelerated ``ElementTree`` implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x) is used. -* ``dom`` - builds a tree based on ``xml.dom.minidom``. +* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`. -* ``lxml.etree`` - uses lxml's implementation of the ``ElementTree`` +* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree`` API. The performance gains are relatively small compared to using the accelerated ``ElementTree`` module. @@ -31,21 +34,15 @@ You can specify the builder by name when using the shorthand API: with open("mydocument.html", "rb") as f: lxml_etree_document = html5lib.parse(f, treebuilder="lxml") -When instantiating a parser object, you have to pass a tree builder -class in the ``tree`` keyword attribute: +To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function. -.. code-block:: python - - import html5lib - parser = html5lib.HTMLParser(tree=SomeTreeBuilder) - document = parser.parse("

Hello World!") - -To get a builder class by name, use the ``getTreeBuilder`` function: +When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword argument: .. code-block:: python import html5lib - parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + TreeBuilder = html5lib.getTreeBuilder("dom") + parser = html5lib.HTMLParser(tree=TreeBuilder) minidom_document = parser.parse("

Hello World!") The implementation of builders can be found in `html5lib/treebuilders/ @@ -55,17 +52,16 @@ The implementation of builders can be found in `html5lib/treebuilders/ Tree walkers ------------ -Once a tree is ready, you can work on it either manually, or using -a tree walker, which provides a streaming view of the tree. html5lib -provides walkers for all three supported types of trees (``etree``, -``dom`` and ``lxml``). +In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it. +html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams `_. The implementation of walkers can be found in `html5lib/treewalkers/ `_. -Walkers make consuming HTML easier. html5lib uses them to provide you -with has a couple of handy tools. +html5lib provides a few tools for consuming token streams: +* :class:`~html5lib.serializer.HTMLSerializer`, to generate a stream of bytes; and +* filters, to manipulate the token stream. HTMLSerializer ~~~~~~~~~~~~~~ @@ -90,15 +86,14 @@ The serializer lets you write HTML back as a stream of bytes. '>' 'Witam wszystkich' -You can customize the serializer behaviour in a variety of ways, consult -the :class:`~html5lib.serializer.htmlserializer.HTMLSerializer` -documentation. +You can customize the serializer behaviour in a variety of ways. Consult +the :class:`~html5lib.serializer.HTMLSerializer` documentation. Filters ~~~~~~~ -You can alter the stream content with filters provided by html5lib: +html5lib provides several filters: * :class:`alphabeticalattributes.Filter ` sorts attributes on @@ -110,11 +105,11 @@ You can alter the stream content with filters provided by html5lib: the document * :class:`lint.Filter ` raises - ``LintError`` exceptions on invalid tag and attribute names, invalid + :exc:`AssertionError` exceptions on invalid tag and attribute names, invalid PCDATA, etc. 
* :class:`optionaltags.Filter ` - removes tags from the stream which are not necessary to produce valid + removes tags from the token stream which are not necessary to produce valid HTML * :class:`sanitizer.Filter ` removes @@ -125,9 +120,9 @@ You can alter the stream content with filters provided by html5lib: * :class:`whitespace.Filter ` collapses all whitespace characters to single spaces unless they're in - ``

`` or ``textarea`` tags.
+  ``
`` or ``