Skip to content

Commit e5788d7

Browse files
committed
A few minor fixes; fix up etree serialization a bit, allow a single parser instance to parse multiple documents
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40417
1 parent c0e75ea commit e5788d7

File tree

4 files changed

+17
-13
lines changed

4 files changed

+17
-13
lines changed

setup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
license="MIT License",
66
description='HTML parser based on the WHAT-WG Web Applications 1.0'
77
'("HTML5") specifcation',
8-
packages=['html5lib'],
9-
package_dir = {'html5lib': 'src'}
8+
packages=['html5lib', 'html5lib.treebuilders'],
9+
package_dir = {'html5lib': 'src',
10+
'html5lib.treebuilders': 'src/treebuilders'}
1011
)

src/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
2828
Note: Because HTML documents support various features not in the
2929
default ElementTree (e.g. doctypes), we supply our own simple
30-
serializer; html5lib.treebuilders.etree.write At present this does not
30+
serializer; html5lib.treebuilders.etree.tostring. At present this does not
3131
have the encoding support offered by the elementtree serializer.
3232
3333
"""

src/parser.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
4646

4747
# Raise an exception on the first error encountered
4848
self.strict = strict
49-
self.errors = []
5049

5150
self.tree = tree()
51+
self.errors = []
5252

5353
self.phases = {
5454
"initial": InitialPhase(self, self.tree),
@@ -69,10 +69,6 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
6969
"afterFrameset": AfterFramesetPhase(self, self.tree),
7070
"trailingEnd": TrailingEndPhase(self, self.tree)
7171
}
72-
self.phase = self.phases["initial"]
73-
# We only seem to have InBodyPhase testcases where the following is
74-
# relevant ... need others too
75-
self.lastPhase = None
7672

7773
def parse(self, stream, innerHTML=False):
7874
"""Parse a HTML document into a well-formed tree
@@ -84,6 +80,12 @@ def parse(self, stream, innerHTML=False):
8480
"""
8581

8682
self.tree.reset()
83+
self.errors = []
84+
85+
self.phase = self.phases["initial"]
86+
# We only seem to have InBodyPhase testcases where the following is
87+
# relevant ... need others too
88+
self.lastPhase = None
8789

8890
# We don't actually support innerHTML yet but this should allow
8991
# assertions

src/treebuilders/etree.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,13 +151,13 @@ def serializeElement(element, indent=0):
151151

152152
return "\n".join(rv)
153153

154-
def write(element):
154+
def tostring(element):
155155
"""Serialize an element and its child nodes to a string"""
156156
rv = []
157157
finalText = None
158158
def serializeElement(element):
159159
if element.tag is DocumentType:
160-
rv.append("<!DOCTYPE %s>\n"%(element.text,))
160+
rv.append("<!DOCTYPE %s>"%(element.text,))
161161
elif element.tag is Document:
162162
if element.text:
163163
rv.append(element.text)
@@ -168,8 +168,9 @@ def serializeElement(element):
168168
serializeElement(child)
169169

170170
elif element.tag is Comment:
171-
rv.append("<!-- %s -->\n"%(element.text,))
171+
rv.append("<!--%s-->"%(element.text,))
172172
else:
173+
#This is assumed to be an ordinary element
173174
if not element.attrib:
174175
rv.append("<%s>"%(element.tag,))
175176
else:
@@ -182,10 +183,10 @@ def serializeElement(element):
182183
for child in element.getchildren():
183184
serializeElement(child)
184185

185-
rv.append("</%s>\n"%(element.tag,))
186+
rv.append("</%s>"%(element.tag,))
186187

187188
if element.tail:
188-
rv.append(element.tail + "\n")
189+
rv.append(element.tail)
189190

190191
serializeElement(element)
191192

0 commit comments

Comments
 (0)