Skip to content

Commit baf166b

Browse files
committed
Move DOMlite to simpletree and add some more treebuilder documentation
--HG-- rename : src/treebuilders/DOMlite.py => src/treebuilders/simpletree.py extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40403
1 parent 40b9e90 commit baf166b

File tree

5 files changed

+119
-17
lines changed

5 files changed

+119
-17
lines changed

parse.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
"""usage: %prog [options] filename
33
4-
Parse a document to a DOMlite tree, with optional profiling
4+
Parse a document to a simpletree tree, with optional profiling
55
"""
66

77
import sys
@@ -40,10 +40,10 @@ def parse():
4040
print "Treebuilder %s not found"%opts.treebuilder
4141
raise
4242
except:
43-
treebuilder = treebuilders.DOMlite.TreeBuilder
43+
treebuilder = treebuilders.simpletree.TreeBuilder
4444
else:
45-
import treebuilders.DOMlite
46-
treebuilder = treebuilders.DOMlite
45+
import treebuilders.simpletree
46+
treebuilder = treebuilders.simpletree
4747

4848
p = parser.HTMLParser(tree=treebuilder)
4949

src/treebuilders/__init__.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,31 @@
1-
# This file is intentionally left blank
1+
"""A collection of modules for building different kinds of tree from
2+
HTML documents.
3+
4+
To create a treebuilder for a new type of tree, you need to
5+
implement several things:
6+
7+
1) A set of classes for various types of elements: Document, Doctype,
8+
Comment, Element. These must implement the interface of
9+
treebuilders._base.Node (although comment nodes have a different
10+
signature for their constructor, see treebuilders.simpletree.Comment).
11+
Textual content may also be implemented as another node type, or not, as
12+
your tree implementation requires.
13+
14+
2) A treebuilder object (called TreeBuilder by convention) that
15+
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
16+
documentClass - the class to use for the root node of a document
17+
elementClass - the class to use for HTML Elements
18+
commentClass - the class to use for comments
19+
doctypeClass - the class to use for doctypes
20+
It also has one required method:
21+
getDocument - Returns the root node of the complete document tree
22+
23+
3) If you wish to run the unit tests, you must also create a
24+
testSerializer method on your treebuilder which accepts a node and
25+
returns a string containing Node and its children serialized according
26+
to the format used in the unittests
27+
28+
The supplied simpletree module provides a python-only implementation
29+
of a full treebuilder and is a useful reference for the semantics of
30+
the various methods.
31+
"""

src/treebuilders/_base.py

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,84 @@
2020
#XXX - TODO; make the default interface more ElementTree-like
2121
# rather than DOM-like
2222

23+
class Node(object):
24+
def __init__(self, name):
25+
"""Node representing an item in the tree.
26+
name - The tag name associated with the node
27+
parent - The parent of the current node (or None for the root node)
28+
value - The value of the current node (applies to text nodes and
29+
comments)
30+
attributes - a dict holding name, value pairs for attributes of the node
31+
childNodes - a list of child nodes of the current node. This must
32+
include all elements but not necessarily other node types
33+
_flags - A list of miscellaneous flags that can be set on the node
34+
"""
35+
self.name = name
36+
self.parent = None
37+
self.value = None
38+
self.attributes = {}
39+
self.childNodes = []
40+
self._flags = []
41+
42+
def __str__(self):
43+
attributesStr = " ".join(["%s=\"%s\""%(name, value)
44+
for name, value in
45+
self.attributes.iteritems()])
46+
if attributesStr:
47+
return "<%s %s>"%(self.name,attributesStr)
48+
else:
49+
return "<%s>"%(self.name)
50+
51+
def __repr__(self):
52+
return "<%s %s>" % (self.__class__, self.name)
53+
54+
def appendChild(self, node):
55+
"""Insert node as a child of the current node"""
56+
raise NotImplementedError
57+
58+
def insertText(self, data, insertBefore=None):
59+
"""Insert data as text in the current node, positioned before the
60+
start of node insertBefore or to the end of the node's text.
61+
"""
62+
raise NotImplementedError
63+
64+
def insertBefore(self, node, refNode):
65+
"""Insert node as a child of the current node, before refNode in the
66+
list of child nodes. Raises ValueError if refNode is not a child of
67+
the current node"""
68+
raise NotImplementedError
69+
70+
def removeChild(self, node):
71+
"""Remove node from the children of the current node"""
72+
raise NotImplementedError
73+
74+
def reparentChildren(self, newParent):
75+
"""Move all the children of the current node to newParent.
76+
This is needed so that trees that don't store text as nodes move the
77+
text in the correct way"""
78+
#XXX - should this method be made more general?
79+
for child in self.childNodes:
80+
newParent.appendChild(child)
81+
self.childNodes = []
82+
83+
def cloneNode(self):
84+
"""Return a shallow copy of the current node i.e. a node with the same
85+
name and attributes but with no parent or child nodes"""
86+
raise NotImplementedError
87+
88+
89+
def hasContent(self):
90+
"""Return true if the node has children or text, false otherwise
91+
"""
92+
raise NotImplementedError
93+
2394
class TreeBuilder(object):
24-
"""Base treebuilder implementation"""
95+
"""Base treebuilder implementation
96+
documentClass - the class to use for the root node of a document
97+
elementClass - the class to use for HTML Elements
98+
commentClass - the class to use for comments
99+
doctypeClass - the class to use for doctypes
100+
"""
25101

26102
#Document class
27103
documentClass = None
@@ -231,15 +307,11 @@ def generateImpliedEndTags(self, exclude=None):
231307
# self.processEndTag(name)
232308
self.generateImpliedEndTags(exclude)
233309

234-
def reparentChildren(self, oldParent, newParent):
235-
"""Move all the children of oldParent to newParent. This is needed do
236-
that trees that don't store text as nodes move the text in the correct
237-
way"""
238-
#XXX - should this method be made more general?
239-
for child in oldParent.childNodes:
240-
newParent.appendChild(child)
241-
oldParent.childNodes = []
242-
243310
def getDocument(self):
244311
"Return the final tree"
245312
return self.document
313+
314+
def testSerializer(self, node):
315+
"""Serialize the subtree of node in the format required by unit tests
316+
node - the node from which to start serializing"""
317+
raise NotImplementedError
File renamed without changes.

tests/test_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
import parser
1313
#Run tests over all treebuilders
1414
#XXX - it would be nice to automate finding all treebuilders or to allow running just one
15-
from treebuilders import DOMlite, etree
15+
from treebuilders import simpletree, etree
1616

17-
treetypes = {"DOMlite":DOMlite.TreeBuilder,
17+
treetypes = {"simpletree":simpletree.TreeBuilder,
1818
"ElementTree":etree.TreeBuilder}
1919

2020
def parseTestcase(testString):

0 commit comments

Comments
 (0)