Skip to content

Commit 55d183d

Browse files
committed
New implementation of the elementtree treewalker that doesn't fail with character data immediately before the root node
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401008
1 parent 8ae506e commit 55d183d

File tree

2 files changed

+64
-53
lines changed

2 files changed

+64
-53
lines changed

src/html5lib/treewalkers/etree.py

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -28,23 +28,22 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
2828
to avoid using recursion, returns "nodes" as tuples with the following
2929
content:
3030
31-
1. An Element node serving as *context* (it cannot be called the parent
32-
node due to the particular ``tail`` text nodes.
33-
34-
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
35-
36-
3. A list used as a stack of all ancestor *context nodes*. It is a
37-
pair tuple whose first item is an Element and second item is a child
38-
index.
31+
1. The current element
32+
33+
2. The index of the element relative to its parent
34+
35+
3. A stack of ancestor elements
36+
37+
4. A flag "text", "tail" or None to indicate if the current node is a
38+
text node; either the text or tail of the current element (1)
3939
"""
40-
41-
def getNodeDetails(self, node):
40+
def getNodeDetails(self, node):
4241
if isinstance(node, tuple): # It might be the root Element
43-
elt, key, parents = node
44-
if key in ("text", "tail"):
45-
return _base.TEXT, getattr(elt, key)
42+
elt, key, parents, flag = node
43+
if flag in ("text", "tail"):
44+
return _base.TEXT, getattr(elt, flag)
4645
else:
47-
node = elt[int(key)]
46+
node = elt
4847

4948
if not(hasattr(node, "tag")):
5049
node = node.getroot()
@@ -61,54 +60,60 @@ def getNodeDetails(self, node):
6160
else:
6261
#This is assumed to be an ordinary element
6362
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
64-
63+
6564
def getFirstChild(self, node):
66-
if isinstance(node, tuple): # It might be the root Element
67-
elt, key, parents = node
68-
assert key not in ("text", "tail"), "Text nodes have no children"
69-
parents.append((elt, int(key)))
70-
node = elt[int(key)]
65+
if isinstance(node, tuple):
66+
element, key, parents, flag = node
7167
else:
72-
parents = []
73-
74-
assert len(node) or node.text, "Node has no children"
75-
if node.text:
76-
return (node, "text", parents)
68+
element, key, parents, flag = node, None, [], None
69+
70+
if flag in ("text", "tail"):
71+
return None
7772
else:
78-
return (node, 0, parents)
79-
73+
if element.text:
74+
return element, key, parents, "text"
75+
elif len(element):
76+
parents.append(element)
77+
return element[0], 0, parents, None
78+
else:
79+
return None
80+
8081
def getNextSibling(self, node):
8182
if isinstance(node, tuple):
82-
elt, key, parents = node
83-
if key == "text":
84-
key = -1
85-
elif key == "tail":
86-
elt, key = parents.pop()
87-
else:
88-
# Look for "tail" of the "revisited" node
89-
child = elt[key]
90-
if child.tail:
91-
parents.append((elt, key))
92-
return (child, "tail", parents)
83+
element, key, parents, flag = node
9384
else:
9485
return None
95-
96-
# case where key were "text" or "tail" or elt[key] had a tail
97-
key += 1
98-
if len(elt) > key:
99-
return (elt, key, parents)
86+
87+
if flag == "text":
88+
if len(element):
89+
parents.append(element)
90+
return element[0], 0, parents, None
91+
else:
92+
return None
10093
else:
101-
return None
102-
94+
if element.tail and flag != "tail":
95+
return element, key, parents, "tail"
96+
elif key < len(parents[-1]) - 1:
97+
return parents[-1][key+1], key+1, parents, None
98+
else:
99+
return None
100+
103101
def getParentNode(self, node):
104102
if isinstance(node, tuple):
105-
elt, key, parents = node
106-
if parents:
107-
elt, key = parents.pop()
108-
return elt, key, parents
109-
else:
110-
return elt
103+
element, key, parents, flag = node
111104
else:
112105
return None
106+
107+
if flag == "text":
108+
if not parents:
109+
return element
110+
else:
111+
return element, key, parents, None
112+
else:
113+
parent = parents.pop()
114+
if not parents:
115+
return parent
116+
else:
117+
return parent, list(parents[-1]).index(parent), parents, None
113118

114119
return locals()

tests/test_treewalkers.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,11 +226,17 @@ def test_all_tokens(self):
226226
{'data': [], 'type': 'StartTag', 'name': u'head'},
227227
{'data': [], 'type': 'EndTag', 'name': u'head'},
228228
{'data': [], 'type': 'StartTag', 'name': u'body'},
229+
{'data': u'a', 'type': 'Characters'},
230+
{'data': [], 'type': 'StartTag', 'name': u'div'},
231+
{'data': u'b', 'type': 'Characters'},
232+
{'data': [], 'type': 'EndTag', 'name': u'div'},
233+
{'data': u'c', 'type': 'Characters'},
229234
{'data': [], 'type': 'EndTag', 'name': u'body'},
230-
{'data': [], 'type': 'EndTag', 'name': u'html'}]
235+
{'data': [], 'type': 'EndTag', 'name': u'html'}
236+
]
231237
for treeName, treeCls in treeTypes.iteritems():
232238
p = html5parser.HTMLParser(tree = treeCls["builder"])
233-
document = p.parse("<html></html>")
239+
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
234240
document = treeCls.get("adapter", lambda x: x)(document)
235241
output = treeCls["walker"](document)
236242
for expectedToken, outputToken in zip(expected, output):

0 commit comments

Comments
 (0)