From 1be9532f79fd7744be0945c4ab42d2f5b41e4e73 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:23:01 -0500 Subject: [PATCH 001/141] Added iframe seamless boolean attribute --- html5lib/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..659f2b5e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -535,6 +535,7 @@ "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), "output": frozenset(("disabled", "readonly")), + "iframe": frozenset(("seamless")), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It From 4dfe3cd9f97ce51c53463d633308f4a3fe6ad9e6 Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:04 -0500 Subject: [PATCH 002/141] Update CHANGES.rst --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 1431b3c9..89e48f94 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,7 @@ Change Log Released on XXX, 2014 -* XXX +* Fix #XXX: added the seamless attribute for iframes. 
0.999 From 7fd79e31e083ab75305b3e837ea9aa8c9b4675ff Mon Sep 17 00:00:00 2001 From: Ritwik Gupta Date: Mon, 24 Nov 2014 16:25:28 -0500 Subject: [PATCH 003/141] Update AUTHORS.rst --- AUTHORS.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..787c3b94 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,4 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Ritwik Gupta From ec674a97243e76da43f06abfd0a891308f1ff801 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 18 Feb 2017 17:24:23 +0000 Subject: [PATCH 004/141] Fix Travis (#319) There were two causes of breakage: - using pip 6.0.7, which doesn't properly install things in the right order - requirements-install.sh not properly handling the case where SIX_VERSION isn't defined --- .travis.yml | 2 +- requirements-install.sh | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 09ef5985..c5ffd833 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,7 @@ env: - SIX_VERSION=1.9 USE_OPTIONAL=true install: - - bash requirements-install.sh + - ./requirements-install.sh script: - if [[ $TRAVIS_PYTHON_VERSION == pypy* ]]; then py.test; fi diff --git a/requirements-install.sh b/requirements-install.sh index 0be226a6..7ba9396f 100755 --- a/requirements-install.sh +++ b/requirements-install.sh @@ -1,9 +1,6 @@ -#!/bin/bash -e +#!/bin/bash -ex -if [[ $USE_OPTIONAL != "true" && $USE_OPTIONAL != "false" ]]; then - echo "fatal: \$USE_OPTIONAL not set to true or false. Exiting." 
- exit 1 -fi +pip install pip==6.1.0 pip install -U -r requirements-test.txt @@ -11,7 +8,7 @@ if [[ $USE_OPTIONAL == "true" ]]; then pip install -U -r requirements-optional.txt fi -if [[ $SIX_VERSION != "false" ]]; then +if [[ $SIX_VERSION ]]; then pip install six==$SIX_VERSION fi From 17499b9763a090f7715af49555d21fe4b558958b Mon Sep 17 00:00:00 2001 From: Benedikt Morbach Date: Sun, 19 Feb 2017 00:59:38 +0100 Subject: [PATCH 005/141] Avoid DeprecationWarnings on Python 3.6 (#318) Python 3.6 produces warnings on invalid escape sequences in strings, such as "\s", and they will be syntax errors in a future version of Python. See . --- html5lib/filters/sanitizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index b5ddcb93..dc801668 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -782,7 +782,7 @@ def allowed_token(self, token): # characters, nor why we call unescape. I just know it's always been here. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # this will do is remove *more* than it otherwise would. 
- val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '', + val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '', unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") @@ -807,7 +807,7 @@ def allowed_token(self, token): ' ', unescape(attrs[attr])) if (token["name"] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*', + (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespaces['xlink'], 'href')] if (None, 'style') in attrs: @@ -837,16 +837,16 @@ def disallowed_token(self, token): def sanitize_css(self, style): # disallow urls - style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet - if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): + if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' - if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): + if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] - for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style): + for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): if not value: continue if prop.lower() in self.allowed_css_properties: @@ -855,7 +855,7 @@ def sanitize_css(self, style): 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa + not re.match(r"^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') From 792fbdb17fd0b6c41eca1d1258e41394337e16a7 Mon Sep 17 00:00:00 2001 From: jonathan vanasco 
Date: Mon, 6 Mar 2017 12:06:33 -0500 Subject: [PATCH 006/141] added `itemscope` as a boolean attribute (issue #194) --- AUTHORS.rst | 1 + CHANGES.rst | 6 ++ html5lib/constants.py | 2 +- .../tests/serializer-testdata/options.test | 69 +++++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index c3820ef7..5623e03a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -42,3 +42,4 @@ Patches and suggestions - Michael[tm] Smith - Marc Abramowitz - Jon Dufresne +- Jonathan Vanasco diff --git a/CHANGES.rst b/CHANGES.rst index 570c9605..f2591d2f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Change Log ---------- +unreleased +~~~~~~~~~~~~~~~~~~ + +* Added `itemscope` as boolean attribute + https://github.com/html5lib/html5lib-python/issues/194 + 0.999999999/1.0b10 ~~~~~~~~~~~~~~~~~~ diff --git a/html5lib/constants.py b/html5lib/constants.py index 9e7541d3..975aa021 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -588,7 +588,7 @@ ]) booleanAttributes = { - "": frozenset(["irrelevant"]), + "": frozenset(["irrelevant", "itemscope"]), "style": frozenset(["scoped"]), "img": frozenset(["ismap"]), "audio": frozenset(["autoplay", "controls"]), diff --git a/html5lib/tests/serializer-testdata/options.test b/html5lib/tests/serializer-testdata/options.test index eedcb3f0..a22eebfc 100644 --- a/html5lib/tests/serializer-testdata/options.test +++ b/html5lib/tests/serializer-testdata/options.test @@ -46,6 +46,29 @@ "quote_attr_values": "always" } }, + { + "expected": [ + "
" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "div", + [ + { + "namespace": null, + "name": "itemscope", + "value": "itemscope" + } + ] + ] + ], + "description": "quote_attr_values='always' with itemscope", + "options": { + "quote_attr_values": "always" + } + }, { "expected": [ "
" @@ -171,6 +194,29 @@ "use_trailing_solidus": true } }, + { + "expected": [ + "
" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "div", + [ + { + "namespace": null, + "name": "itemscope", + "value": "itemscope" + } + ] + ] + ], + "description": "minimize_boolean_attributes=false", + "options": { + "minimize_boolean_attributes": false + } + }, { "expected": [ "
" @@ -194,6 +240,29 @@ "minimize_boolean_attributes": false } }, + { + "expected": [ + "
" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "div", + [ + { + "namespace": null, + "name": "itemscope", + "value": "" + } + ] + ] + ], + "description": "minimize_boolean_attributes=false with empty value", + "options": { + "minimize_boolean_attributes": false + } + }, { "expected": [ "
" From 224d9f4e243f6645e88b32ad7342a55128f19eeb Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 08:22:35 -0700 Subject: [PATCH 007/141] Fix formatting of docstring example It runs together in the built HTML. --- html5lib/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index f3cd9455..e615ffae 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -4,11 +4,11 @@ HTML found in the wild and implements well-defined error recovery that is largely compatible with modern desktop web browsers. -Example usage: +Example usage:: -import html5lib -f = open("my_document.html") -tree = html5lib.parse(f) + import html5lib + f = open("my_document.html") + tree = html5lib.parse(f) """ from __future__ import absolute_import, division, unicode_literals From 3fb6af3f78d1cc95b25b71306a2d82b0c12f7996 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 08:24:30 -0700 Subject: [PATCH 008/141] Use with, it's idiomatic --- html5lib/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index e615ffae..b4639bde 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -7,8 +7,8 @@ Example usage:: import html5lib - f = open("my_document.html") - tree = html5lib.parse(f) + with open("my_document.html") as f: + tree = html5lib.parse(f) """ from __future__ import absolute_import, division, unicode_literals From ba63e09ed9421b3aecf7ac97b0e943f2f74e1825 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:27:34 -0700 Subject: [PATCH 009/141] Fix typo in changelog --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 570c9605..ce2d2c5b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -63,7 +63,7 @@ Released on July 14, 2016 to clarify their status as public.** * **Get rid of the sanitizer package. 
Merge sanitizer.sanitize into the - sanitizer.htmlsanitizer module and move that to saniziter. This means + sanitizer.htmlsanitizer module and move that to sanitizer. This means anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no code changes.** From 6b99d52ad6867ee6262ddfcac42a501be54685c7 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:28:17 -0700 Subject: [PATCH 010/141] Export and document html5lib.__version__ It's not much use if it's private. --- html5lib/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index b4639bde..f202d5b3 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -19,7 +19,8 @@ from .serializer import serialize __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", - "getTreeWalker", "serialize"] + "getTreeWalker", "serialize", "__version__"] # this has to be at the top level, see how setup.py parses this +#: Distribution version number, which asymptotically approaches 1. __version__ = "0.9999999999-dev" From 323d736ca03bbd2535987eed9b55f6f000f5fba5 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:28:38 -0700 Subject: [PATCH 011/141] Add a documentation env to tox.ini Run "tox -e doc" to build the documentation in doc/_build. --- tox.ini | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index da64de71..830999cd 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional} +envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional},doc [testenv] deps = @@ -11,7 +11,12 @@ deps = base: webencodings py26-base: ordereddict optional: -r{toxinidir}/requirements-optional.txt + doc: Sphinx commands = {envbindir}/py.test {toxinidir}/flake8-run.sh + +[testenv:doc] +changedir = doc +commands = sphinx-build -b html . 
_build From 964d0e166ac49062c6cbabc1263f624332e3afec Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:31:56 -0700 Subject: [PATCH 012/141] Clean up html5lib module documentation Right now the docs have entries for re-exports like html5lib.__init__.HTMLParser, including full class documentation. This is redundant with the docs for html5lib.html5parser.HTMLParser, which is a public name anyway, so I think that it is best to be explicit that this is a re-export. --- doc/html5lib.rst | 6 ++---- html5lib/__init__.py | 7 +++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/html5lib.rst b/doc/html5lib.rst index f0646aac..22af7728 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -4,10 +4,8 @@ html5lib Package :mod:`html5lib` Package ----------------------- -.. automodule:: html5lib.__init__ - :members: - :undoc-members: - :show-inheritance: +.. automodule:: html5lib + :members: __version__ :mod:`constants` Module ----------------------- diff --git a/html5lib/__init__.py b/html5lib/__init__.py index f202d5b3..745b9342 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -9,6 +9,13 @@ import html5lib with open("my_document.html") as f: tree = html5lib.parse(f) + +For convenience, this module re-exports the following names: + +* :func:`~.html5parser.parse`, :func:`~.html5parser.parseFragment`, and :class:`~.html5parser.HTMLParser` +* :func:`~.treebuilders.getTreeBuilder` +* :func:`~.treewalkers.getTreeWalker` +* :func:`~.serializer.serialize` """ from __future__ import absolute_import, division, unicode_literals From abf622432db7169f2af7ecc71980b923643f2711 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:36:43 -0700 Subject: [PATCH 013/141] Remove docs for HTMLTokenizer and HTMLSanitizer HTMLTokenizer is now a private API (I cannot find a public export). HTMLSanitizer no longer exists as a tokenizer, and has been replaced with a filter. 
--- doc/movingparts.rst | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 80ee2ad1..3eeff4f2 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -169,41 +169,3 @@ the following way: * If all else fails, the default encoding will be used. This is usually `Windows-1252 `_, which is a common fallback used by Web browsers. - - -Tokenizers ----------- - -The part of the parser responsible for translating a raw input stream -into meaningful tokens is the tokenizer. Currently html5lib provides -two. - -To set up a tokenizer, simply pass it when instantiating -a :class:`~html5lib.html5parser.HTMLParser`: - -.. code-block:: python - - import html5lib - from html5lib import sanitizer - - p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer) - p.parse("

Surprise!") - -HTMLTokenizer -~~~~~~~~~~~~~ - -This is the default tokenizer, the heart of html5lib. The implementation -can be found in `html5lib/tokenizer.py -`_. - -HTMLSanitizer -~~~~~~~~~~~~~ - -This is a tokenizer that removes unsafe markup and CSS styles from the -input. Elements that are known to be safe are passed through and the -rest is converted to visible text. The default configuration of the -sanitizer follows the `WHATWG Sanitization Rules -`_. - -The implementation can be found in `html5lib/sanitizer.py -`_. From 85540983f6285c82f2a1c4a8d756ae58d0c1e713 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:40:10 -0700 Subject: [PATCH 014/141] Fix Sphinx title underline warnings --- doc/html5lib.rst | 2 +- doc/html5lib.treewalkers.rst | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/html5lib.rst b/doc/html5lib.rst index 22af7728..44e34573 100644 --- a/doc/html5lib.rst +++ b/doc/html5lib.rst @@ -24,7 +24,7 @@ html5lib Package :show-inheritance: :mod:`serializer` Module ----------------------- +------------------------ .. automodule:: html5lib.serializer :members: diff --git a/doc/html5lib.treewalkers.rst b/doc/html5lib.treewalkers.rst index 46501258..085d8a98 100644 --- a/doc/html5lib.treewalkers.rst +++ b/doc/html5lib.treewalkers.rst @@ -10,7 +10,7 @@ treewalkers Package :show-inheritance: :mod:`base` Module -------------------- +------------------ .. automodule:: html5lib.treewalkers.base :members: @@ -34,7 +34,7 @@ treewalkers Package :show-inheritance: :mod:`etree_lxml` Module ------------------------ +------------------------ .. automodule:: html5lib.treewalkers.etree_lxml :members: @@ -43,9 +43,9 @@ treewalkers Package :mod:`genshi` Module --------------------------- +-------------------- .. 
automodule:: html5lib.treewalkers.genshi :members: :undoc-members: - :show-inheritance: \ No newline at end of file + :show-inheritance: From c8fca0ecc7c704995947601e03da0c34a85ecdf5 Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 09:50:52 -0700 Subject: [PATCH 015/141] Open in binary mode for Python 3 --- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 745b9342..b1970d29 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -7,7 +7,7 @@ Example usage:: import html5lib - with open("my_document.html") as f: + with open("my_document.html", "rb") as f: tree = html5lib.parse(f) For convenience, this module re-exports the following names: From 637826ffa72ca982dff6ae7204e4afcc35f3e29e Mon Sep 17 00:00:00 2001 From: Tom Most Date: Sat, 15 Apr 2017 11:51:16 -0700 Subject: [PATCH 016/141] Update and expand "moving parts" doc --- doc/movingparts.rst | 65 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/doc/movingparts.rst b/doc/movingparts.rst index 3eeff4f2..1f3086cb 100644 --- a/doc/movingparts.rst +++ b/doc/movingparts.rst @@ -4,22 +4,25 @@ The moving parts html5lib consists of a number of components, which are responsible for handling its features. +Parsing uses a *tree builder* to generate a *tree*, the in-memory representation of the document. +Several tree representations are supported, as are translations to other formats via *tree adapters*. +The tree may be translated to a token stream with a *tree walker*, from which :class:`~html5lib.serializer.HTMLSerializer` produces a stream of bytes. +The token stream may also be transformed by use of *filters* to accomplish tasks like sanitization. Tree builders ------------- The parser reads HTML by tokenizing the content and building a tree that -the user can later access. 
There are three main types of trees that -html5lib can build: +the user can later access. html5lib can build three types of trees: -* ``etree`` - this is the default; builds a tree based on ``xml.etree``, +* ``etree`` - this is the default; builds a tree based on :mod:`xml.etree`, which can be found in the standard library. Whenever possible, the accelerated ``ElementTree`` implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x) is used. -* ``dom`` - builds a tree based on ``xml.dom.minidom``. +* ``dom`` - builds a tree based on :mod:`xml.dom.minidom`. -* ``lxml.etree`` - uses lxml's implementation of the ``ElementTree`` +* ``lxml`` - uses the :mod:`lxml.etree` implementation of the ``ElementTree`` API. The performance gains are relatively small compared to using the accelerated ``ElementTree`` module. @@ -31,21 +34,15 @@ You can specify the builder by name when using the shorthand API: with open("mydocument.html", "rb") as f: lxml_etree_document = html5lib.parse(f, treebuilder="lxml") -When instantiating a parser object, you have to pass a tree builder -class in the ``tree`` keyword attribute: +To get a builder class by name, use the :func:`~html5lib.treebuilders.getTreeBuilder` function. -.. code-block:: python - - import html5lib - parser = html5lib.HTMLParser(tree=SomeTreeBuilder) - document = parser.parse("

Hello World!") -To get a builder class by name, use the ``getTreeBuilder`` function: +When instantiating a :class:`~html5lib.html5parser.HTMLParser` object, you must pass a tree builder class via the ``tree`` keyword argument: .. code-block:: python import html5lib - parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom")) + TreeBuilder = html5lib.getTreeBuilder("dom") + parser = html5lib.HTMLParser(tree=TreeBuilder) minidom_document = parser.parse("

Hello World!") The implementation of builders can be found in `html5lib/treebuilders/ @@ -55,17 +52,16 @@ The implementation of builders can be found in `html5lib/treebuilders/ Tree walkers ------------ -Once a tree is ready, you can work on it either manually, or using -a tree walker, which provides a streaming view of the tree. html5lib -provides walkers for all three supported types of trees (``etree``, -``dom`` and ``lxml``). +In addition to manipulating a tree directly, you can use a tree walker to generate a streaming view of it. +html5lib provides walkers for ``etree``, ``dom``, and ``lxml`` trees, as well as ``genshi`` `markup streams `_. The implementation of walkers can be found in `html5lib/treewalkers/ `_. -Walkers make consuming HTML easier. html5lib uses them to provide you -with has a couple of handy tools. +html5lib provides a few tools for consuming token streams: +* :class:`~html5lib.serializer.HTMLSerializer`, to generate a stream of bytes; and +* filters, to manipulate the token stream. HTMLSerializer ~~~~~~~~~~~~~~ @@ -90,15 +86,14 @@ The serializer lets you write HTML back as a stream of bytes. '>' 'Witam wszystkich' -You can customize the serializer behaviour in a variety of ways, consult -the :class:`~html5lib.serializer.htmlserializer.HTMLSerializer` -documentation. +You can customize the serializer behaviour in a variety of ways. Consult +the :class:`~html5lib.serializer.HTMLSerializer` documentation. Filters ~~~~~~~ -You can alter the stream content with filters provided by html5lib: +html5lib provides several filters: * :class:`alphabeticalattributes.Filter ` sorts attributes on @@ -110,11 +105,11 @@ You can alter the stream content with filters provided by html5lib: the document * :class:`lint.Filter ` raises - ``LintError`` exceptions on invalid tag and attribute names, invalid + :exc:`AssertionError` exceptions on invalid tag and attribute names, invalid PCDATA, etc. 
* :class:`optionaltags.Filter ` - removes tags from the stream which are not necessary to produce valid + removes tags from the token stream which are not necessary to produce valid HTML * :class:`sanitizer.Filter ` removes @@ -125,9 +120,9 @@ You can alter the stream content with filters provided by html5lib: * :class:`whitespace.Filter ` collapses all whitespace characters to single spaces unless they're in - ``

`` or ``textarea`` tags.
+  ``
`` or ``