Skip to content

Commit 139836d

Browse files
committed
Get lxml treebuilder passing tests again.
1 parent b2c4ede commit 139836d

File tree

4 files changed

+28
-9
lines changed

4 files changed

+28
-9
lines changed

html5lib/ihatexml.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
import re
2+
import warnings
3+
4+
from .constants import DataLossWarning
25

36
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
47

@@ -117,10 +120,11 @@ def __init__(self, replaceChars = None,
117120

118121
def coerceAttribute(self, name, namespace=None):
    """Return an XML-safe attribute name, or None if it must be dropped.

    The attribute is dropped (None is returned and a DataLossWarning is
    issued) when either:

    * ``self.dropXmlnsLocalName`` is true and ``name`` starts with
      ``"xmlns:"``; or
    * ``self.dropXmlnsAttrNs`` is true and ``namespace`` is the XMLNS
      namespace URI ``http://www.w3.org/2000/xmlns/``.

    In every other case the name is handed to ``self.toXmlName`` and its
    result is returned.
    """
    if self.dropXmlnsLocalName and name.startswith("xmlns:"):
        warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
        return None
    elif (self.dropXmlnsAttrNs and
          namespace == "http://www.w3.org/2000/xmlns/"):
        # The URI tested above is the "xmlns" namespace; the "xml"
        # namespace is a different URI, so the message names xmlns.
        warnings.warn("Attributes cannot be in the xmlns namespace",
                      DataLossWarning)
        return None
    else:
        return self.toXmlName(name)
@@ -131,11 +135,14 @@ def coerceElement(self, name, namespace=None):
131135
def coerceComment(self, data):
    """Return comment text that is safe to place inside an XML comment.

    Nothing is changed unless ``self.preventDoubleDashComments`` is true.
    When it is, every ``"--"`` is broken up as ``"- -"`` and a trailing
    ``"-"`` is followed by a space, because XML comments may neither
    contain ``--`` nor end with ``-``.  A DataLossWarning is issued for
    each rewrite pass and for the trailing-dash fix.
    """
    if self.preventDoubleDashComments:
        # One replace pass is not always enough: "---" becomes "- --",
        # which still contains "--", so repeat until none is left.
        while "--" in data:
            warnings.warn("Comments cannot contain adjacent dashes",
                          DataLossWarning)
            data = data.replace("--", "- -")
        # A comment ending in "-" would serialise as "--->", which is
        # also not well-formed.
        if data.endswith("-"):
            warnings.warn("Comments cannot end in a dash", DataLossWarning)
            data += " "
    return data
136141

137142
def coerceCharacters(self, data):
    """Return text with characters that XML cannot hold replaced.

    When ``self.replaceFormFeedCharacters`` is true, every U+000C (form
    feed) in ``data`` is replaced by a space and one DataLossWarning is
    issued per occurrence.  Otherwise ``data`` is returned unchanged.
    """
    if self.replaceFormFeedCharacters:
        # Warn once per form feed so callers counting warnings see
        # exactly how many characters were lost.
        for _ in range(data.count("\x0C")):
            warnings.warn("Text cannot contain U+000C", DataLossWarning)
        data = data.replace("\x0C", " ")
    # Other non-xml characters are not handled here yet.
    return data
@@ -145,13 +152,15 @@ def toXmlName(self, name):
145152
nameRest = name[1:]
146153
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
147154
if m:
155+
warnings.warn("Coercing non-XML name", DataLossWarning)
148156
nameFirstOutput = self.getReplacementCharacter(nameFirst)
149157
else:
150158
nameFirstOutput = nameFirst
151159

152160
nameRestOutput = nameRest
153161
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
154162
for char in replaceChars:
163+
warnings.warn("Coercing non-XML name", DataLossWarning)
155164
replacement = self.getReplacementCharacter(char)
156165
nameRestOutput = nameRestOutput.replace(char, replacement)
157166
return nameFirstOutput + nameRestOutput

html5lib/treebuilders/etree.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,11 @@ def _setSystemId(self, value):
178178

179179
class Document(Element):
    """Element subclass that stands for the document node of a tree."""

    def __init__(self):
        # "DOCUMENT_ROOT" is a sentinel tag: the serializers and the
        # etree tree walker compare element.tag against this exact
        # string, so it must be changed in step with them.
        # NOTE(review): the angle brackets were presumably dropped so the
        # tag is a legal XML name for lxml -- confirm.
        Element.__init__(self, "DOCUMENT_ROOT")
182182

183183
class DocumentFragment(Element):
    """Element subclass that stands for a document fragment node."""

    def __init__(self):
        # "DOCUMENT_FRAGMENT" is a sentinel tag: the etree tree walker
        # compares node.tag against this exact string, so it must be
        # changed in step with that check.
        Element.__init__(self, "DOCUMENT_FRAGMENT")
186186

187187
def testSerializer(element):
188188
rv = []
@@ -198,7 +198,7 @@ def serializeElement(element, indent=0):
198198
element.text, publicId, systemId))
199199
else:
200200
rv.append("<!DOCTYPE %s>"%(element.text,))
201-
elif element.tag == "<DOCUMENT_ROOT>":
201+
elif element.tag == "DOCUMENT_ROOT":
202202
rv.append("#document")
203203
if element.text:
204204
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
@@ -263,7 +263,7 @@ def serializeElement(element):
263263
element.text, publicId, systemId))
264264
else:
265265
rv.append("<!DOCTYPE %s>"%(element.text,))
266-
elif element.tag == "<DOCUMENT_ROOT>":
266+
elif element.tag == "DOCUMENT_ROOT":
267267
if element.text:
268268
rv.append(element.text)
269269
if element.tail:

html5lib/treebuilders/etree_lxml.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def serializeElement(element, indent=0):
8080
serializeElement(next_element, indent+2)
8181
elif type(element.tag) == type(etree.Comment):
8282
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
83+
if hasattr(element, "tail") and element.tail:
84+
rv.append("|%s\"%s\"" %(' '*indent, element.tail))
8385
else:
8486
nsmatch = etree_builders.tag_regexp.match(element.tag)
8587
if nsmatch is not None:
@@ -113,8 +115,8 @@ def serializeElement(element, indent=0):
113115
indent += 2
114116
for child in element.getchildren():
115117
serializeElement(child, indent)
116-
if hasattr(element, "tail") and element.tail:
117-
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
118+
if hasattr(element, "tail") and element.tail:
119+
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
118120
serializeElement(element, 0)
119121

120122
if finalText is not None:
@@ -286,6 +288,12 @@ def insertDoctype(self, token):
286288

287289
def insertCommentInitial(self, data, parent=None):
    """Record a comment's text in ``self.initial_comments``.

    ``parent`` is accepted but not used.
    NOTE(review): presumably this is installed as ``insertComment`` until
    ``insertRoot`` switches to ``insertCommentMain``, which would explain
    the shared signature -- confirm against the constructor.
    """
    self.initial_comments.append(data)
291+
292+
def insertCommentMain(self, data, parent=None):
    """Insert a comment, warning first if lxml will lose information.

    A DataLossWarning is issued when the comment is being added to the
    document itself and the last child of the root element has a tag of
    the same type as ``etree.Comment``.  The comment is then inserted by
    the base class ``insertComment`` in every case.
    """
    if (parent == self.document and
            type(self.document._elementTree.getroot()[-1].tag) == type(etree.Comment)):
        warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
    # Explicit two-argument super(), matching the form insertRoot uses
    # for this class; the zero-argument form does not exist on Python 2.
    super(TreeBuilder, self).insertComment(data, parent)
289297

290298
def insertRoot(self, token):
291299
"""Create the document root"""
@@ -301,6 +309,8 @@ def insertRoot(self, token):
301309
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
302310
self.doctype.systemId or "")
303311
docStr += ">"
312+
if self.doctype.name != token["name"]:
313+
warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
304314
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
305315

306316
try:
@@ -333,4 +343,4 @@ def insertRoot(self, token):
333343
self.openElements.append(root_element)
334344

335345
#Reset to the default insert comment function
336-
self.insertComment = super(TreeBuilder, self).insertComment
346+
self.insertComment = self.insertCommentMain

html5lib/treewalkers/etree.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def getNodeDetails(self, node):
3838
if not(hasattr(node, "tag")):
3939
node = node.getroot()
4040

41-
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
41+
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
4242
return (_base.DOCUMENT,)
4343

4444
elif node.tag == "<!DOCTYPE>":

0 commit comments

Comments
 (0)