1
+ import _base
2
+ import new
3
+ import warnings
4
+ from html5lib .constants import DataLossWarning
5
+ import etree as etree_builders
6
+ try :
7
+ import lxml .html as etree
8
+ except ImportError :
9
+ import lxml .etree as etree
10
+
11
# Module-level switch read by TreeBuilder.getDocument: when true it returns
# the whole ElementTree, otherwise only the tree's root element.
fullTree = True
12
+
13
+ """Module for supporting the lxml.etree library. The idea here is to use as much
14
+ of the native library as possible, without using fragile hacks like custom element
15
+ names that break between releases. The downside of this is that we cannot represent
16
+ all possible trees; specifically the following are known to cause problems:
17
+
18
+ Text or comments as siblings of the root element
19
+ Doctypes with mixed case names
20
+ Doctypes with no name
21
+
22
+ When any of these things occur, we emit a DataLossWarning
23
+ """
24
+
25
class DocumentType(object):
    """Plain record of a doctype's name and public/system identifiers.

    A DataLossWarning is emitted when the name contains upper-case
    characters, because (per the warning text) lxml does not preserve
    doctype case.
    """

    def __init__(self, name, publicId=None, systemId=None):
        """Store the doctype fields.

        name     -- doctype name; may be None or empty when none was given
        publicId -- public identifier, or None
        systemId -- system identifier, or None
        """
        self.name = name
        # A missing name has no case to lose; skipping the check also avoids
        # calling .lower() on None, which would raise AttributeError.
        if name and name != name.lower():
            warnings.warn("lxml does not preserve doctype case", DataLossWarning)
        self.publicId = publicId
        self.systemId = systemId
32
+
33
class Document(object):
    """Stand-in document node that holds an ElementTree and a child list.

    ``_elementTree`` starts as None and ``_childNodes`` as an empty list;
    both are filled in by code outside this class.
    """

    def __init__(self):
        self._childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Warn about data loss; *element* is not stored anywhere."""
        message = "lxml does not support comments as siblings of the root node"
        warnings.warn(message, DataLossWarning)

    def _getChildNodes(self):
        """Return the list that backs the read-only ``childNodes`` property."""
        return self._childNodes

    childNodes = property(fget=_getChildNodes)
45
+
46
+ def testSerializer (element ):
47
+ rv = []
48
+ finalText = None
49
+ def serializeElement (element , indent = 0 ):
50
+ if not hasattr (element , "tag" ):
51
+ rv .append ("#document" )
52
+ if element .docinfo .internalDTD :
53
+ dtd_str = element .docinfo .doctype
54
+ if not dtd_str :
55
+ dtd_str = "<!DOCTYPE %s>" % element .docinfo .root_name
56
+ rv .append ("|%s%s" % (' ' * (indent + 2 ), dtd_str ))
57
+ serializeElement (element .getroot (), indent + 2 )
58
+ elif type (element .tag ) == type (etree .Comment ):
59
+ rv .append ("|%s<!-- %s -->" % (' ' * indent , element .text ))
60
+ else :
61
+ rv .append ("|%s<%s>" % (' ' * indent , element .tag ))
62
+ if hasattr (element , "attrib" ):
63
+ for name , value in element .attrib .iteritems ():
64
+ rv .append ('|%s%s="%s"' % (' ' * (indent + 2 ), name , value ))
65
+ if element .text :
66
+ rv .append ("|%s\" %s\" " % (' ' * (indent + 2 ), element .text ))
67
+ indent += 2
68
+ for child in element .getchildren ():
69
+ serializeElement (child , indent )
70
+ if hasattr (element , "tail" ) and element .tail :
71
+ rv .append ("|%s\" %s\" " % (' ' * (indent - 2 ), element .tail ))
72
+ serializeElement (element , 0 )
73
+
74
+ if finalText is not None :
75
+ rv .append ("|%s\" %s\" " % (' ' * 2 , finalText ))
76
+
77
+ return "\n " .join (rv )
78
+
79
def tostring(element):
    """Serialize an element and its child nodes to a string.

    element -- an element, or an object without a ``tag`` attribute, which is
               treated as a whole document tree: its doctype (if it has an
               internal DTD) is written first, then its root element.

    Returns the markup as a single string. Text and attribute values are
    written as-is; no escaping is applied.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # No tag attribute: treat the object as the document tree itself.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            # Iterating an element yields its children in document order.
            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)
    return "".join(rv)
121
+
122
class TreeBuilder(_base.TreeBuilder):
    """Tree builder whose document is backed by an lxml ElementTree.

    The element, comment and fragment classes come from the module returned
    by ``etree_builders.getETreeModule`` and are set per instance in
    ``__init__``. Until ``insertRoot`` has run, ``insertComment`` is bound to
    ``insertCommentInitial``, which only warns.
    """

    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = None

    def __init__(self, fullTree=False):
        """Bind the node classes, then run the base initialiser.

        fullTree -- forwarded to ``etree_builders.getETreeModule``.
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        self.elementClass = builder.Element
        self.commentClass = builder.Comment
        self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self)

    def reset(self):
        """Reset base state, forget any doctype and disable comment insertion."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.doctype = None

    def testSerializer(self, element):
        """Delegate to the module-level ``testSerializer`` function."""
        return testSerializer(element)

    def getDocument(self):
        """Return the ElementTree, or only its root element.

        NOTE(review): the choice is made by the module-level ``fullTree``
        flag, not by the ``fullTree`` argument given to ``__init__`` --
        confirm that this is intended.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the ``_element`` of the fragment built by the base class."""
        return _base.TreeBuilder.getFragment(self)._element

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype so that ``insertRoot`` can emit it.

        Warns with DataLossWarning when *name* is empty or None.
        """
        if not name:
            warnings.warn("lxml cannot represent null doctype", DataLossWarning)
        doctype = self.doctypeClass(name)
        doctype.publicId = publicId
        doctype.systemId = systemId
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Comment handler used before the root exists: warn and drop it."""
        warnings.warn("lxml does not support comments as siblings of the root node", DataLossWarning)

    def insertRoot(self, name):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal.
        docStr = ""
        doctype = self.doctype
        # A doctype without a name cannot be written out (insertDoctype has
        # already warned about it), so it is left out of the markup.
        if doctype and doctype.name:
            docStr += "<!DOCTYPE %s" % doctype.name
            if doctype.publicId is not None or doctype.systemId is not None:
                # Both identifiers are emitted, quoted and space-separated,
                # so the declaration is well formed; a missing one becomes
                # an empty literal.
                # NOTE(review): identifiers containing '"' are not escaped.
                docStr += ' PUBLIC "%s" "%s"' % (doctype.publicId or "",
                                                 doctype.systemId or "")
            docStr += ">"
        docStr += "<html></html>"

        root = etree.fromstring(docStr)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures.
        # NOTE(review): the parsed root is always <html>, whatever *name* is;
        # *name* only reaches the wrapper object -- confirm this is intended.
        root_element = self.elementClass(name)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment
0 commit comments