Skip to content

Commit 06aac66

Browse files
committed
Documentation improvements and a reset method on the treebuilder so calling a single parser multiple times doesn't do funky things
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40414
1 parent 51260d7 commit 06aac66

File tree

3 files changed

+51
-8
lines changed

3 files changed

+51
-8
lines changed

src/__init__.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,34 @@
1+
"""
2+
HTML parsing library based on the WHATWG "HTML5"
3+
specification. The parser is designed to be compatible with existing
4+
HTML found in the wild and implements well-defined error recovery that
5+
is largely compatible with modern desktop web browsers.
6+
7+
Example usage:
8+
9+
import html5lib
10+
f = open("my_document.html")
11+
p = html5lib.HTMLParser()
12+
tree = p.parse(f)
13+
14+
By default the returned tree format is a custom "simpletree", similar
15+
to a DOM tree; each element has attributes childNodes and parent
16+
holding the children and parent respectively, a name attribute
17+
holding the Element name, a data attribute holding the element data
18+
(for text and comment nodes) and an attributes dictionary holding the
19+
element's attributes (for Element nodes).
20+
21+
To get output in ElementTree format:
22+
23+
import html5lib
24+
from html5lib.treebuilders import etree
25+
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
26+
elementtree = p.parse(f)
27+
28+
Note: Because HTML documents support various features not in the
29+
default ElementTree (e.g. doctypes), we supply our own simple
30+
serializer: html5lib.treebuilders.etree.write. At present this does not
31+
have the encoding support offered by the elementtree serializer.
32+
33+
"""
134
from parser import HTMLParser

src/parser.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,17 @@
3232
from constants import headingElements, tableInsertModeElements
3333

3434
class HTMLParser(object):
35-
"""Main parser class"""
35+
"""HTML parser. Generates a tree structure from a stream of (possibly
36+
malformed) HTML"""
3637

3738
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
38-
"""HTML parser. Generates a tree structure from a stream of (possibly
39-
malformed) HTML.
39+
"""
4040
strict - raise an exception when a parse error is encountered
41+
4142
tree - a treebuilder class controlling the type of tree that will be
42-
returned (default - html5lib.simpletree.TreeBuilder)"""
43+
returned. This class is almost always a subclass of
44+
html5lib.treebuilders._base.TreeBuilder
45+
"""
4346

4447
# Raise an exception on the first error encountered
4548
self.strict = strict
@@ -72,11 +75,15 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
7275
self.lastPhase = None
7376

7477
def parse(self, stream, innerHTML=False):
75-
"""Stream should be a stream of unicode bytes. Character encoding
76-
issues have not yet been dealt with."""
77-
78-
# XXX - need to ensure the tree is reset here
78+
"""Parse an HTML document into a well-formed tree
7979
80+
stream - a filelike object or string containing the HTML to be parsed
81+
82+
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
83+
is not yet supported)
84+
"""
85+
86+
self.tree.reset()
8087

8188
# We don't actually support innerHTML yet but this should allow
8289
# assertions

src/treebuilders/_base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ class TreeBuilder(object):
112112
doctypeClass = None
113113

114114
def __init__(self):
115+
self.reset()
116+
117+
def reset(self):
115118
self.openElements = []
116119
self.activeFormattingElements = []
117120

0 commit comments

Comments
 (0)