Skip to content

Commit 5999365

Browse files
committed
Support publicIds and systemIds in doctypes
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401142
1 parent afe181d commit 5999365

File tree

9 files changed

+66
-31
lines changed

9 files changed

+66
-31
lines changed

src/html5lib/treebuilders/_base.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,9 +213,7 @@ def insertRoot(self, name):
213213
self.document.appendChild(element)
214214

215215
def insertDoctype(self, name, publicId, systemId):
216-
doctype = self.doctypeClass(name)
217-
doctype.publicId = publicId
218-
doctype.systemId = systemId
216+
doctype = self.doctypeClass(name, publicId, systemId)
219217
self.document.appendChild(doctype)
220218

221219
def insertComment(self, data, parent=None):

src/html5lib/treebuilders/dom.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def serializeElement(element, indent=0):
141141
if element.publicId or element.systemId:
142142
publicId = element.publicId or ""
143143
systemId = element.systemId or ""
144-
rv.append( """|%s<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
144+
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
145145
' '*indent, element.name, publicId, systemId))
146146
else:
147147
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))

src/html5lib/treebuilders/etree.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,14 @@ def _setData(self, value):
132132
data = property(_getData, _setData)
133133

134134
class DocumentType(Element):
135-
def __init__(self, name):
135+
def __init__(self, name, publicId, systemId):
136136
Element.__init__(self, "<!DOCTYPE>")
137137
self._element.text = name
138+
self.publicId = publicId
139+
self.systemId = systemId
138140

139141
def _getPublicId(self):
140-
return self._element.get(u"publicId", None)
142+
return self._element.get(u"publicId", "")
141143

142144
def _setPublicId(self, value):
143145
if value is not None:
@@ -146,7 +148,7 @@ def _setPublicId(self, value):
146148
publicId = property(_getPublicId, _setPublicId)
147149

148150
def _getSystemId(self):
149-
return self._element.get(u"systemId", None)
151+
return self._element.get(u"systemId", "")
150152

151153
def _setSystemId(self, value):
152154
if value is not None:
@@ -172,7 +174,7 @@ def serializeElement(element, indent=0):
172174
if element.get("publicId") or element.get("systemId"):
173175
publicId = element.get("publicId") or ""
174176
systemId = element.get("systemId") or ""
175-
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
177+
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
176178
element.text, publicId, systemId))
177179
else:
178180
rv.append("<!DOCTYPE %s>"%(element.text,))

src/html5lib/treebuilders/etree_lxml.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@
2323
"""
2424

2525
class DocumentType(object):
26-
def __init__(self, name, publicId = None, systemId = None):
26+
def __init__(self, name, publicId, systemId):
2727
self.name = name
2828
if name != name.lower():
29-
warnings.warn("lxml does not preserve doctype case", DataLossWarning)
29+
warnings.warn("lxml does not preserve doctype case", DataLossWarning)
3030
self.publicId = publicId
3131
self.systemId = systemId
3232

@@ -56,7 +56,7 @@ def serializeElement(element, indent=0):
5656
element.docinfo.system_url):
5757
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
5858
else:
59-
dtd_str = """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
59+
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
6060
element.docinfo.root_name,
6161
element.docinfo.public_id,
6262
element.docinfo.system_url)
@@ -181,9 +181,7 @@ def getFragment(self):
181181
def insertDoctype(self, name, publicId, systemId):
182182
if not name:
183183
warnings.warn("lxml cannot represent null doctype", DataLossWarning)
184-
doctype = self.doctypeClass(name)
185-
doctype.publicId = publicId
186-
doctype.systemId = systemId
184+
doctype = self.doctypeClass(name, publicId, systemId)
187185
self.doctype = doctype
188186

189187
def insertCommentInitial(self, data, parent=None):
@@ -196,7 +194,7 @@ def insertRoot(self, name):
196194
#Therefore we need to use the built-in parser to create our initial
197195
#tree, after which we can add elements like normal
198196
docStr = ""
199-
if self.doctype:
197+
if self.doctype and self.doctype.name:
200198
docStr += "<!DOCTYPE %s"%self.doctype.name
201199
if self.doctype.publicId is not None or self.doctype.systemId is not None:
202200
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",

src/html5lib/treebuilders/simpletree.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -106,17 +106,15 @@ def __unicode__(self):
106106

107107
class DocumentType(Node):
108108
type = 3
109-
def __init__(self, name):
109+
def __init__(self, name, publicId, systemId):
110110
Node.__init__(self, name)
111-
self.publicId = u""
112-
self.systemId = u""
111+
self.publicId = publicId
112+
self.systemId = systemId
113113

114114
def __unicode__(self):
115115
if self.publicId or self.systemId:
116-
publicId = self.publicId or ""
117-
systemId = self.systemId or ""
118-
return """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
119-
self.name, publicId, systemId)
116+
return """<!DOCTYPE %s "%s" "%s">"""%(
117+
self.name, self.publicId, self.systemId)
120118

121119
else:
122120
return u"<!DOCTYPE %s>" % self.name

src/html5lib/treebuilders/soup.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,11 @@ def documentClass(self):
104104
return Element(self.soup, self.soup)
105105

106106
def insertDoctype(self, name, publicId, systemId):
107-
if publicId or systemId:
108-
publicId = publicId or ""
109-
systemId = systemId or ""
107+
if publicId:
110108
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId)))
109+
elif systemId:
110+
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
111+
(name, systemId)))
111112
else:
112113
self.soup.insert(0, Declaration(name))
113114

@@ -135,10 +136,25 @@ def getFragment(self):
135136
return _base.TreeBuilder.getFragment(self).element
136137

137138
def testSerializer(element):
139+
import re
138140
rv = []
139141
def serializeElement(element, indent=0):
140142
if isinstance(element, Declaration):
141-
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
143+
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
144+
m = re.compile(doctype_regexp).match(element.string)
145+
assert m is not None, "DOCTYPE did not match expected format"
146+
name = m.group('name')
147+
publicId = m.group('publicId')
148+
if publicId is not None:
149+
systemId = m.group('systemId1')
150+
else:
151+
systemId = m.group('systemId2')
152+
153+
if publicId is not None or systemId is not None:
154+
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
155+
(' '*indent, name, publicId or "", systemId or ""))
156+
else:
157+
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
142158

143159
elif isinstance(element, BeautifulSoup):
144160
if element.name == "[document_fragment]":

src/html5lib/treewalkers/etree.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ def getNodeDetails(self, node):
5252
return (_base.DOCUMENT,)
5353

5454
elif node.tag == "<!DOCTYPE>":
55-
return _base.DOCTYPE, node.text
55+
return (_base.DOCTYPE, node.text,
56+
node.get("publicId"), node.get("systemId"))
5657

5758
elif type(node.tag) == type(ElementTree.Comment):
5859
return _base.COMMENT, node.text

src/html5lib/treewalkers/lxmletree.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ def __getitem__(self, key):
3333
def getnext(self):
3434
return None
3535

36+
def __len__(self):
37+
return 1
38+
3639
class Doctype(object):
3740
def __init__(self, root_node, name, public_id, system_id):
3841
self.root_node = root_node
@@ -91,6 +94,9 @@ def getparent(self):
9194
def __str__(self):
9295
return str(self.obj)
9396

97+
def __len__(self):
98+
return len(self.obj)
99+
94100

95101
class TreeWalker(_base.NonRecursiveTreeWalker):
96102
def __init__(self, tree):
@@ -119,12 +125,12 @@ def getNodeDetails(self, node):
119125

120126
else:
121127
#This is assumed to be an ordinary element
122-
return _base.ELEMENT, node.tag, node.attrib.items(), bool(node) or node.text
128+
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) > 0 or node.text
123129

124130
def getFirstChild(self, node):
125131
assert not isinstance(node, tuple), _("Text nodes have no children")
126132

127-
assert bool(node) or node.text, "Node has no children"
133+
assert len(node) or node.text, "Node has no children"
128134
if node.text:
129135
return (node, "text")
130136
else:
@@ -137,7 +143,7 @@ def getNextSibling(self, node):
137143
if key == "text":
138144
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
139145
# because node[0] might evaluate to False if it has no child element
140-
if bool(node):
146+
if len(node):
141147
return node[0]
142148
else:
143149
return None

src/html5lib/treewalkers/soup.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
import gettext
23
_ = gettext.gettext
34

@@ -6,13 +7,28 @@
67
import _base
78

89
class TreeWalker(_base.NonRecursiveTreeWalker):
10+
doctype_regexp = re.compile(
11+
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
912
def getNodeDetails(self, node):
1013
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
1114
return (_base.DOCUMENT,)
1215

1316
elif isinstance(node, Declaration): # DocumentType
1417
#Slice needed to remove markup added during unicode conversion
15-
return _base.DOCTYPE, unicode(node.string)[2:-1]
18+
m = self.doctype_regexp.match(unicode(node.string)[2:-1])
19+
#This regexp approach seems wrong and fragile
20+
#but beautiful soup stores the doctype as a single thing and we want the separate bits
21+
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
22+
#been modified at all
23+
#We could just feed it to a html5lib tokenizer, I guess...
24+
assert m is not None, "DOCTYPE did not match expected format"
25+
name = m.group('name')
26+
publicId = m.group('publicId')
27+
if publicId is not None:
28+
systemId = m.group('systemId1')
29+
else:
30+
systemId = m.group('systemId2')
31+
return _base.DOCTYPE, name, publicId or "", systemId or ""
1632

1733
elif isinstance(node, Comment):
1834
return _base.COMMENT, unicode(node.string)[4:-3]

0 commit comments

Comments
 (0)