Skip to content

Commit e51c9f6

Browse files
committed
Experimental new approach to lxml.etree that seems to fit better with the library philosophy but can't represent all possible HTML documents
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401092
1 parent 447b711 commit e51c9f6

File tree

5 files changed

+204
-3
lines changed

5 files changed

+204
-3
lines changed

src/html5lib/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,3 +1041,6 @@
10411041
"tis-620",
10421042
"hz-gb-2312",
10431043
))
1044+
1045+
class DataLossWarning(UserWarning):
    """Warning category raised when part of a document cannot be represented.

    Tree builders emit this when the target tree type is unable to hold
    some piece of the parsed input faithfully.
    """

src/html5lib/html5parser.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,7 @@ def processEndTag(self, name):
427427
class RootElementPhase(Phase):
428428
# helper methods
429429
def insertHtmlElement(self):
430-
element = self.tree.createElement("html", {})
431-
self.tree.openElements.append(element)
432-
self.tree.document.appendChild(element)
430+
self.tree.insertRoot("html")
433431
self.parser.phase = self.parser.phases["beforeHead"]
434432

435433
# other

src/html5lib/treebuilders/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
5858
elif treeType == "beautifulsoup":
5959
import soup
6060
treeBuilderCache[treeType] = soup.TreeBuilder
61+
elif treeType == "lxml":
62+
import etree_lxml
63+
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
6164
elif treeType == "etree":
6265
import etree
6366
# XXX: NEVER cache here, caching is done in the etree submodule

src/html5lib/treebuilders/_base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ def elementInActiveFormattingElements(self, name):
207207
return item
208208
return False
209209

210+
def insertRoot(self, name):
    """Create the document's root element and make it the current node.

    name -- tag name of the root element to create.

    The new element is pushed onto the stack of open elements and
    appended to the document.
    """
    # Honour the requested name instead of hard-coding "html" so that the
    # parameter is not silently ignored.
    element = self.createElement(name, {})
    self.openElements.append(element)
    self.document.appendChild(element)
214+
210215
def insertDoctype(self, name, publicId, systemId):
211216
doctype = self.doctypeClass(name)
212217
doctype.publicId = publicId
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import _base
2+
import new
3+
import warnings
4+
from html5lib.constants import DataLossWarning
5+
import etree as etree_builders
6+
try:
7+
import lxml.html as etree
8+
except ImportError:
9+
import lxml.etree as etree
10+
11+
fullTree = True
12+
13+
"""Module for supporting the lxml.etree library. The idea here is to use as much
14+
of the native library as possible, without using fragile hacks like custom element
15+
names that break between releases. The downside of this is that we cannot represent
16+
all possible trees; specifically the following are known to cause problems:
17+
18+
Text or comments as siblings of the root element
19+
Doctypes with mixed case names
20+
Doctypes with no name
21+
22+
When any of these things occur, we emit a DataLossWarning
23+
"""
24+
25+
class DocumentType(object):
    """Plain record of a doctype's name, public ID and system ID."""

    def __init__(self, name, publicId = None, systemId = None):
        """Store the doctype fields.

        name     -- doctype name; may be None or empty for a nameless doctype
        publicId -- public identifier, or None
        systemId -- system identifier, or None

        Emits a DataLossWarning when the name contains upper-case
        characters.
        """
        self.name = name
        # A missing name has no case to lose; guarding here avoids calling
        # .lower() on None.
        if name and name != name.lower():
            warnings.warn("lxml does not preserve doctype case", DataLossWarning)
        self.publicId = publicId
        self.systemId = systemId
32+
33+
class Document(object):
    """Stand-in document node that holds the lxml ElementTree once built."""

    def __init__(self):
        # List of child nodes exposed through the childNodes property.
        self._childNodes = []
        # Starts as None; filled in by code outside this class.
        self._elementTree = None

    def appendChild(self, element):
        """Emit a DataLossWarning; the element itself is not stored."""
        message = "lxml does not support comments as siblings of the root node"
        warnings.warn(message, DataLossWarning)

    def _getChildNodes(self):
        """Return the internal list of child nodes."""
        return self._childNodes

    # Read-only view of the child list.
    childNodes = property(_getChildNodes)
45+
46+
def testSerializer(element):
    """Return an indented, line-per-node text dump of element.

    Objects without a "tag" attribute are treated as a whole document:
    "#document" is written, followed by the doctype (if the tree has an
    internal DTD) and then the root element. Comments, elements,
    attributes, text and tail text each get a line prefixed with "|" and
    indentation.
    """
    lines = []

    def walk(node, depth=0):
        if not hasattr(node, "tag"):
            # Document-level object: header, optional doctype, then root.
            lines.append("#document")
            info = node.docinfo
            if info.internalDTD:
                doctype = info.doctype or "<!DOCTYPE %s>"%info.root_name
                lines.append("|%s%s"%(' '*(depth+2), doctype))
            walk(node.getroot(), depth+2)
        elif type(node.tag) == type(etree.Comment):
            lines.append("|%s<!-- %s -->"%(' '*depth, node.text))
        else:
            lines.append("|%s<%s>"%(' '*depth, node.tag))
            # Attributes, text and children are one level deeper.
            depth += 2
            pad = ' '*depth
            if hasattr(node, "attrib"):
                for key, val in node.attrib.iteritems():
                    lines.append('|%s%s="%s"' % (pad, key, val))
            if node.text:
                lines.append("|%s\"%s\"" %(pad, node.text))
            for child in node.getchildren():
                walk(child, depth)
        # Tail text is written two columns left of the current depth.
        if hasattr(node, "tail") and node.tail:
            lines.append("|%s\"%s\"" %(' '*(depth-2), node.tail))

    walk(element, 0)
    return "\n".join(lines)
78+
79+
def tostring(element):
    """Serialize an element and its child nodes to a string

    Objects without a "tag" attribute are treated as a whole document:
    the doctype is written first (when the tree has an internal DTD),
    followed by the root element. Text and attribute values are written
    as-is, without escaping.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Document-level object
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->"%(element.text,))

        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element.getchildren():
                serializeElement(child)

            rv.append("</%s>"%(element.tag,))

        # Document-level objects have no tail, hence the hasattr guard.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    # The former "finalText" epilogue was removed: the variable was never
    # assigned, and its format string had one placeholder for two values.
    return "".join(rv)
121+
122+
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces a native lxml tree.

    The doctype and comments seen before the root element are handled
    specially: the doctype is remembered until the root is created, and
    comments arriving before the root are dropped with a DataLossWarning.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = None

    def __init__(self, fullTree = False):
        """Bind the element, comment and fragment classes from the etree builder."""
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        self.elementClass = builder.Element
        self.commentClass = builder.Comment
        self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self)

    def reset(self):
        """Reset builder state; comments are dropped until the root exists."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.doctype = None

    def testSerializer(self, element):
        """Return the module-level testSerializer dump of element."""
        return testSerializer(element)

    def getDocument(self):
        """Return the ElementTree, or its root element.

        NOTE(review): this reads the module-level fullTree flag, not the
        fullTree argument given to __init__ -- confirm that is intended.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the underlying element of the fragment built by the base class."""
        return _base.TreeBuilder.getFragment(self)._element

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype so insertRoot can write it into the tree.

        A doctype without a name cannot be represented: a DataLossWarning
        is emitted and no doctype is stored.
        """
        if not name:
            warnings.warn("lxml cannot represent null doctype", DataLossWarning)
            # Storing a nameless doctype would later yield the unparseable
            # markup "<!DOCTYPE >".
            self.doctype = None
            return
        doctype = self.doctypeClass(name)
        doctype.publicId = publicId
        doctype.systemId = systemId
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Drop a comment seen before the root element, with a warning."""
        warnings.warn("lxml does not support comments as siblings of the root node", DataLossWarning)

    def _quoteDoctypeId(self, value):
        """Return value wrapped in whichever quote character it does not contain."""
        if '"' not in value:
            return '"%s"' % value
        if "'" in value:
            # Both quote characters present: no literal can hold the value
            # unchanged, so substitute the single quotes.
            warnings.warn("DOCTYPE identifier cannot contain single and double quotes", DataLossWarning)
            value = value.replace("'", "U00027")
        return "'%s'" % value

    def insertRoot(self, name):
        """Create the document root"""
        #Because of the way libxml2 works, it doesn't seem to be possible to alter information
        #like the doctype after the tree has been parsed. Therefore we need to use the built-in
        #parser to create our initial tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            docStr += "<!DOCTYPE %s"%self.doctype.name
            publicId = self.doctype.publicId
            systemId = self.doctype.systemId
            if publicId is not None:
                #A public identifier must be followed by a system literal,
                #so an empty one is written when no system ID was given
                docStr += ' PUBLIC "%s" %s'%(publicId, self._quoteDoctypeId(systemId or ""))
            elif systemId:
                docStr += " SYSTEM %s"%self._quoteDoctypeId(systemId)
            docStr += ">"
        docStr += "<html></html>"

        root = etree.fromstring(docStr)

        #Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        #Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        #Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment

0 commit comments

Comments
 (0)