Skip to content

Commit 9f8929b

Browse files
committed
Fix most of the lxml treebuilder errors. This code is still kind of fragile, especially in the fragment case
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401134
1 parent ca41c61 commit 9f8929b

File tree

1 file changed

+97
-6
lines changed

1 file changed

+97
-6
lines changed

src/html5lib/treewalkers/lxmletree.py

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,112 @@
77

88
from html5lib.constants import voidElements
99

10+
class Root(object):
    """Synthetic document node placed above an lxml element tree.

    ``children`` holds, in order, an optional ``Doctype`` (only when the
    tree's docinfo reports an internal DTD) followed by every top-level
    sibling of the tree's root element, from the first to the last.
    """

    def __init__(self, et):
        """Build the child list for the element tree ``et``."""
        self.elementtree = et
        # The document node itself never carries text.
        self.text = None
        self.tail = None

        self.children = []
        docinfo = et.docinfo
        if docinfo.internalDTD:
            doctype = Doctype(self, docinfo.root_name,
                              docinfo.public_id,
                              docinfo.system_url)
            self.children.append(doctype)

        # Rewind from the root element to its first preceding sibling ...
        first = et.getroot()
        previous = first.getprevious()
        while previous is not None:
            first = previous
            previous = first.getprevious()

        # ... then collect every sibling from there onwards.
        sibling = first
        while sibling is not None:
            self.children.append(sibling)
            sibling = sibling.getnext()

    def __getitem__(self, key):
        """Index (or slice) directly into ``children``."""
        return self.children[key]

    def getnext(self):
        """The document node has no following sibling."""
        return None
35+
36+
class Doctype(object):
37+
def __init__(self, root_node, name, public_id, system_id):
38+
self.root_node = root_node
39+
self.name = name
40+
self.public_id = public_id
41+
self.system_id = system_id
42+
43+
self.text = None
44+
self.tail = None
45+
46+
def getnext(self):
47+
return self.root_node.children[1]
48+
49+
class FragmentRoot(Root):
    """Root node for a fragment supplied as a list of top-level items.

    Every item of the list is wrapped in a ``FragmentWrapper`` that points
    back at this root.
    """

    def __init__(self, children):
        """Wrap each entry of ``children``; ``Root.__init__`` is not called."""
        self.children = [FragmentWrapper(self, item) for item in children]
        self.text = None
        self.tail = None

    def getnext(self):
        """A fragment root has no following sibling."""
        return None
56+
57+
class FragmentWrapper(object):
    """Wrap one top-level fragment item so it can be walked like a node.

    The wrapped object may be an element-like object or a plain string.
    ``text`` and ``tail`` are copied from the wrapped object when it has
    them (``None`` otherwise), ``isstring`` records whether the wrapped
    object is a string, and any other attribute access is forwarded to the
    wrapped object.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # string check keeps working on interpreters without it.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, fragment_root, obj):
        """Wrap ``obj`` as a child of ``fragment_root``.

        ``fragment_root`` must expose a ``children`` list containing this
        wrapper; ``getnext`` relies on it.
        """
        self.root_node = fragment_root
        self.obj = obj
        # Plain strings have neither attribute, so both default to None.
        self.text = getattr(obj, "text", None)
        self.tail = getattr(obj, "tail", None)
        self.isstring = isinstance(obj, self._string_types)

    def __getattr__(self, name):
        """Forward lookups of unknown attributes to the wrapped object."""
        # If ``obj`` itself is missing (instance created without __init__,
        # e.g. while being copied), looking up ``self.obj`` would re-enter
        # this method forever; fail with a normal AttributeError instead.
        if name == "obj":
            raise AttributeError(name)
        return getattr(self.obj, name)

    def getnext(self):
        """Return the following wrapper in the root's child list, or None."""
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        return None

    def __getitem__(self, key):
        """Index into the wrapped object."""
        return self.obj[key]

    def __nonzero__(self):
        """Truthiness of the wrapper is the truthiness of the wrapped object."""
        return bool(self.obj)

    # Python 3 consults ``__bool__`` rather than ``__nonzero__``.
    __bool__ = __nonzero__

    def getparent(self):
        """Top-level fragment items have no parent."""
        return None

    def __str__(self):
        """Return ``str()`` of the wrapped object."""
        return str(self.obj)
93+
94+
1095
class TreeWalker(_base.NonRecursiveTreeWalker):
96+
def __init__(self, tree):
    """Set up the walker, wrapping ``tree`` in a synthetic root if needed.

    Objects exposing ``getroot`` are wrapped in ``Root``; lists are
    wrapped in ``FragmentRoot``; anything else is passed through as is.
    """
    wrapped = tree
    if hasattr(tree, "getroot"):
        wrapped = Root(tree)
    elif isinstance(tree, list):
        wrapped = FragmentRoot(tree)
    _base.NonRecursiveTreeWalker.__init__(self, wrapped)
11102
def getNodeDetails(self, node):
12103
if isinstance(node, tuple): # Text node
13104
node, key = node
14105
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
15106
return _base.TEXT, getattr(node, key)
16107

17-
if not(hasattr(node, "tag")):
18-
node = node.getroot()
19-
20-
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
108+
elif isinstance(node, Root):
21109
return (_base.DOCUMENT,)
22110

23-
elif node.tag == "<!DOCTYPE>":
24-
return _base.DOCTYPE, node.text
111+
elif isinstance(node, Doctype):
112+
return _base.DOCTYPE, node.name, node.public_id, node.system_id
113+
114+
elif isinstance(node, FragmentWrapper) and node.isstring:
115+
return _base.TEXT, node
25116

26117
elif node.tag == etree.Comment:
27118
return _base.COMMENT, node.text

0 commit comments

Comments
 (0)