Skip to content

Commit 36517ca

Browse files
committed
Initial elementtree support (not yet unit-tested), and changes to parse.py to support multiple treebuilders
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40387
1 parent ecf2ab0 commit 36517ca

File tree

3 files changed

+47
-15
lines changed

3 files changed

+47
-15
lines changed

parse.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/usr/bin/env python
12
"""usage: %prog [options] filename
23
34
Parse a document to a DOMlite tree, with optional profiling
@@ -7,7 +8,7 @@
78
import os
89
from optparse import OptionParser
910

10-
from src import parser
11+
from src import parser, treebuilders
1112

1213
def convertTreeDump(treedump):
1314
"""convert the output of str(document) to something more readable
@@ -25,10 +26,23 @@ def parse():
2526
optParser = getOptParser()
2627
opts,args = optParser.parse_args()
2728

28-
p = parser.HTMLParser()
29-
# Don't try to open args[0]. It should be possible to pass a string or file
30-
# reference. HTMLInputStream takes care of the difference.
31-
f = args[0]
29+
try:
30+
f = args[-1]
31+
except IndexError:
32+
print "No filename provided. Use -h for help"
33+
sys.exit(1)
34+
if hasattr(opts, "treebuilder"):
35+
try:
36+
#XXX - This isn't a great way to do this: exec/eval run the raw --treebuilder option string as code
37+
exec("import treebuilders.%s")%opts.treebuilder.split(".")[0]
38+
treebuilder = eval("treebuilders.%s"%opts.treebuilder)
39+
except NameError:
40+
print "Treebuilder %s not found"%opts.treebuilder
41+
raise
42+
else:
43+
treebuilder = treebuilders.DOMlite.TreeBuilder
44+
p = parser.HTMLParser(tree=treebuilder)
45+
3246
if opts.profile:
3347
import hotshot
3448
import hotshot.stats
@@ -45,25 +59,28 @@ def parse():
4559
t0 = time.time()
4660
document = p.parse(f)
4761
t1 = time.time()
48-
print convertTreeDump(document.printTree())
62+
print p.tree.testSerializer(document)
4963
t2 = time.time()
5064
print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
5165
else:
5266
document = p.parse(f)
53-
print convertTreeDump(document.printTree())
67+
print p.tree.testSerializer(document)
5468
print "\nParse errors:\n" + "\n".join(p.errors)
5569

5670
def getOptParser():
5771
parser = OptionParser(usage=__doc__)
5872

5973
parser.add_option("-p", "--profile", action="store_true", default=False,
60-
dest="profile", help="Use the hotdhot profiler to "
74+
dest="profile", help="Use the hotshot profiler to "
6175
"produce a detailed log of the run")
6276

6377
parser.add_option("-t", "--time",
6478
action="store_true", default=False, dest="time",
6579
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
6680

81+
parser.add_option("-b", "--treebuilder", action="store", type="string",
82+
dest="treebuilder")
83+
6784
return parser
6885

6986
if __name__ == "__main__":

src/treebuilders/DOMlite.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,3 +119,5 @@ class TreeBuilder(base.TreeBuilder):
119119
elementClass = Element
120120
commentClass = CommentNode
121121

122+
def testSerializer(self, node):
123+
node.printTree()

src/treebuilders/etree.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,15 @@
77

88
class Element(object):
99
def __init__(self, name):
10-
self._element = ElementTree.Element()
10+
self._element = ElementTree.Element(name)
1111
self.name = name
1212
self.parent = None
13+
self._flags = []
14+
15+
#Set the element text and tail to the empty string rather than None
16+
#XXX - is this desirable or should we do it on a case by case basis?
17+
self._element.text = ""
18+
self._element.tail = ""
1319

1420
def _setName(self, name):
1521
self._element.tag = name
@@ -28,7 +34,7 @@ def _setAttributes(self, attributes):
2834
for key in self._element.attrib.keys():
2935
del self._element.attrib[key]
3036
for key, value in attributes.iteritems():
31-
self._elements.set(key, value)
37+
self._element.set(key, value)
3238

3339
attributes = property(_getAttributes, _setAttributes)
3440

@@ -37,23 +43,23 @@ def appendChild(self, node):
3743
node.parent = self
3844

3945
def insertBefore(self, node, refNode):
40-
index = self._element.getChildren().index(refNode._element)
46+
index = self._element.getchildren().index(refNode._element)
4147
self._element.insert(index, node._element)
4248
node.parent = self
4349

4450
def removeChild(self, node):
4551
self._element.remove(node._element)
4652
node.parent=None
4753

48-
def insertText(self, text, insertBefore):
54+
def insertText(self, data, insertBefore=None):
4955
if not(len(self._element)):
5056
self._element.text += data
5157
elif insertBefore is None:
5258
#Insert the text as the tail of the last child element
5359
self._element[-1].tail += data
5460
else:
5561
#Insert the text before the specified node
56-
children = self._element.getChildren()
62+
children = self._element.getchildren()
5763
index = children.index(insertBefore._element)
5864
if index > 0:
5965
self._element[index-1].tail += data
@@ -83,11 +89,18 @@ def _setData(self, value):
8389
class DocumentType(Element):
8490
pass
8591

92+
class Document(Element):
93+
def __init__(self):
94+
Element.__init__(self, "")
95+
8696
class TreeBuilder(base.TreeBuilder):
87-
documentClass = Element
97+
documentClass = Document
8898
doctypeClass = DocumentType
8999
elementClass = Element
90100
commentClass = Comment
91101

102+
def testSerializer(self, element):
103+
ElementTree.tostring(element)
104+
92105
def getDocument(self):
93-
return self.document._element
106+
return self.document.getchildren()[0]._element

0 commit comments

Comments
 (0)