Skip to content

Commit 114ab64

Browse files
committed
Make the Python html5lib tree-walker API represent attributes as a dict mapping (namespace, name) keys to values.
I promise I won't update this part of the API again — this is now something I'm happy with, so I won't do what I've just done and change it twice in six^W seven months again. The only possible slight update from this is to move to an ordered dict for trees that can preserve attribute order, which should not break the API.
1 parent bc4ceca commit 114ab64

File tree

10 files changed

+64
-75
lines changed

10 files changed

+64
-75
lines changed

html5lib/filters/inject_meta_charset.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,32 +20,26 @@ def __iter__(self):
2020
if token["name"].lower() == "meta":
2121
# replace charset with actual encoding
2222
has_http_equiv_content_type = False
23-
content_index = -1
24-
for i,attr in enumerate(token["data"]):
25-
namespace = attr["namespace"]
26-
name = attr["name"]
27-
value = attr["value"]
23+
for (namespace,name),value in token["data"].iteritems():
2824
if namespace != None:
2925
continue
3026
elif name.lower() == 'charset':
31-
token["data"][i]["value"] = self.encoding
27+
token["data"][(namespace,name)] = self.encoding
3228
meta_found = True
3329
break
3430
elif name == 'http-equiv' and value.lower() == 'content-type':
3531
has_http_equiv_content_type = True
36-
elif name == 'content':
37-
content_index = i
3832
else:
39-
if has_http_equiv_content_type and content_index >= 0:
40-
token["data"][content_index]["value"] = u'text/html; charset=%s' % self.encoding
33+
if has_http_equiv_content_type and (None, "content") in token["data"]:
34+
token["data"][(None, "content")] = u'text/html; charset=%s' % self.encoding
4135
meta_found = True
4236

4337
elif token["name"].lower() == "head" and not meta_found:
4438
# insert meta into empty head
4539
yield {"type": "StartTag", "name": "head",
4640
"data": token["data"]}
4741
yield {"type": "EmptyTag", "name": "meta",
48-
"data": [{"namespace": None, "name": "charset", "value": self.encoding}]}
42+
"data": {(None, "charset"): self.encoding}}
4943
yield {"type": "EndTag", "name": "head"}
5044
meta_found = True
5145
continue
@@ -56,7 +50,7 @@ def __iter__(self):
5650
yield pending.pop(0)
5751
if not meta_found:
5852
yield {"type": "EmptyTag", "name": "meta",
59-
"data": [{"namespace": None, "name": "charset", "value": self.encoding}]}
53+
"data": {(None, "charset"): self.encoding}}
6054
while pending:
6155
yield pending.pop(0)
6256
meta_found = True

html5lib/serializer/htmlserializer.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -220,15 +220,11 @@ def serialize(self, treewalker, encoding=None):
220220
in_cdata = True
221221
elif in_cdata:
222222
self.serializeError(_("Unexpected child element of a CDATA element"))
223-
attrs = token["data"]
224-
if hasattr(attrs, "items"):
225-
attrs = attrs.items()
226-
attrs.sort()
227223
attributes = []
228-
for attr in attrs:
224+
for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):
229225
#TODO: Add namespace support here
230-
k = attr["name"]
231-
v = attr["value"]
226+
k = attr_name
227+
v = attr_value
232228
if encoding:
233229
k = k.encode(encoding, "strict")
234230
attributes.append(' ')

html5lib/tests/test_serializer.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def __iter__(self):
3131
else:
3232
namespace = default_namespace
3333
name, attrib = token[1:3]
34-
yield self.startTag(namespace, name, attrib)
34+
yield self.startTag(namespace, name, self._convertAttrib(attrib))
3535
elif type == "EndTag":
3636
if len(token) == 3:
3737
namespace, name = token[1:3]
@@ -45,7 +45,7 @@ def __iter__(self):
4545
else:
4646
namespace = default_namespace
4747
name, attrib = token[1:]
48-
for token in self.emptyTag(namespace, name, attrib):
48+
for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
4949
yield token
5050
elif type == "Comment":
5151
yield self.comment(token[1])
@@ -61,6 +61,19 @@ def __iter__(self):
6161
yield self.doctype(token[1])
6262
else:
6363
raise ValueError("Unknown token type: " + type)
64+
65+
def _convertAttrib(self, attribs):
66+
"""html5lib tree-walkers use a dict of (namespace, name): value for
67+
attributes, but JSON cannot represent this. Convert from the format
68+
in the serializer tests (a list of dicts with "namespace", "name",
69+
and "value" as keys) to html5lib's tree-walker format."""
70+
attrs = {}
71+
for attrib in attribs:
72+
name = (attrib["namespace"], attrib["name"])
73+
assert(name not in attrs)
74+
attrs[name] = attrib["value"]
75+
return attrs
76+
6477

6578
class TestCase(unittest.TestCase):
6679
def addTest(cls, name, description, input, expected, xhtml, options):

html5lib/tests/test_treewalkers.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -196,17 +196,17 @@ def convertTokens(tokens):
196196
indent += 2
197197
attrs = token["data"]
198198
if attrs:
199-
attrs.sort(lambda a,b: cmp(a["name"], b["name"]))
200-
for attr in attrs:
201-
if attr["namespace"]:
202-
if attr["namespace"] in constants.prefixes:
203-
name = constants.prefixes[attr["namespace"]]
199+
#TODO: Remove this if statement, attrs should always exist
200+
for (namespace,name),value in sorted(attrs.items()):
201+
if namespace:
202+
if namespace in constants.prefixes:
203+
outputname = constants.prefixes[namespace]
204204
else:
205-
name = attr["namespace"]
206-
name += u" " + attr["name"]
205+
outputname = namespace
206+
outputname += u" " + name
207207
else:
208-
name = attr["name"]
209-
output.append(u"%s%s=\"%s\"" % (" "*indent, name, attr["value"]))
208+
outputname = name
209+
output.append(u"%s%s=\"%s\"" % (" "*indent, outputname, value))
210210
if type == "EmptyTag":
211211
indent -= 2
212212
elif type == "EndTag":
@@ -270,17 +270,17 @@ def runTest(self, innerHTML, input, expected, errors, treeClass):
270270
class TokenTestCase(unittest.TestCase):
271271
def test_all_tokens(self):
272272
expected = [
273-
{'data': [], 'type': 'StartTag', 'name': u'html'},
274-
{'data': [], 'type': 'StartTag', 'name': u'head'},
275-
{'data': [], 'type': 'EndTag', 'name': u'head'},
276-
{'data': [], 'type': 'StartTag', 'name': u'body'},
273+
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'},
274+
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'},
275+
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'},
276+
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'},
277277
{'data': u'a', 'type': 'Characters'},
278-
{'data': [], 'type': 'StartTag', 'name': u'div'},
278+
{'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'},
279279
{'data': u'b', 'type': 'Characters'},
280-
{'data': [], 'type': 'EndTag', 'name': u'div'},
280+
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'},
281281
{'data': u'c', 'type': 'Characters'},
282-
{'data': [], 'type': 'EndTag', 'name': u'body'},
283-
{'data': [], 'type': 'EndTag', 'name': u'html'}
282+
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'},
283+
{'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'}
284284
]
285285
for treeName, treeCls in treeTypes.iteritems():
286286
p = html5parser.HTMLParser(tree = treeCls["builder"])

html5lib/treewalkers/_base.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ def error(self, msg):
1515
return {"type": "SerializeError", "data": msg}
1616

1717
def normalizeAttrs(self, attrs):
18-
if not attrs:
19-
attrs = []
20-
for attr in attrs:
21-
attr["namespace"] = unicode(attr["namespace"]) if attr["namespace"] else None
22-
attr["name"] = unicode(attr["name"])
23-
attr["value"] = unicode(attr["value"])
24-
return attrs
18+
newattrs = {}
19+
if attrs:
20+
#TODO: treewalkers should always have attrs
21+
for (namespace,name),value in attrs.iteritems():
22+
namespace = unicode(namespace) if namespace else None
23+
name = unicode(name)
24+
value = unicode(value)
25+
newattrs[(namespace,name)] = value
26+
return newattrs
2527

2628
def emptyTag(self, namespace, name, attrs, hasChildren=False):
2729
yield {"type": "EmptyTag", "name": unicode(name),
@@ -40,7 +42,7 @@ def endTag(self, namespace, name):
4042
return {"type": "EndTag",
4143
"name": unicode(name),
4244
"namespace":unicode(namespace),
43-
"data": []}
45+
"data": {}}
4446

4547
def text(self, data):
4648
data = unicode(data)

html5lib/treewalkers/dom.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,10 @@ def getNodeDetails(self, node):
1515
return _base.TEXT, node.nodeValue
1616

1717
elif node.nodeType == Node.ELEMENT_NODE:
18-
attrs = []
18+
attrs = {}
1919
for attr in node.attributes.keys():
2020
attr = node.getAttributeNode(attr)
21-
attrs.append({"namespace": attr.namespaceURI,
22-
"name": attr.localName,
23-
"value": attr.value})
21+
attrs[(attr.namespaceURI,attr.localName)] = attr.value
2422
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
2523
attrs, node.hasChildNodes())
2624

html5lib/treewalkers/etree.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,17 +69,13 @@ def getNodeDetails(self, node):
6969
else:
7070
namespace = None
7171
tag = node.tag
72-
attrs = []
72+
attrs = {}
7373
for name, value in node.attrib.items():
7474
match = tag_regexp.match(name)
7575
if match:
76-
attrs.append({"namespace": match.group(1),
77-
"name": match.group(2),
78-
"value": value})
76+
attrs[(match.group(1),match.group(2))] = value
7977
else:
80-
attrs.append({"namespace": None,
81-
"name": name,
82-
"value": value})
78+
attrs[(None,name)] = value
8379
return (_base.ELEMENT, namespace, tag,
8480
attrs, len(node) or node.text)
8581

html5lib/treewalkers/lxmletree.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,13 @@ def getNodeDetails(self, node):
140140
else:
141141
namespace = None
142142
tag = node.tag
143-
attrs = []
143+
attrs = {}
144144
for name, value in node.attrib.items():
145145
match = tag_regexp.match(name)
146146
if match:
147-
attrs.append({"namespace": match.group(1),
148-
"name": match.group(2),
149-
"value": value})
147+
attrs[(match.group(1),match.group(2))] = value
150148
else:
151-
attrs.append({"namespace": None,
152-
"name": name,
153-
"value": value})
149+
attrs[(None,name)] = value
154150
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
155151
attrs, len(node) > 0 or node.text)
156152

html5lib/treewalkers/pulldom.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,10 @@ def tokens(self, event, next):
3030
if type == START_ELEMENT:
3131
name = node.nodeName
3232
namespace = node.namespaceURI
33-
attrs = []
33+
attrs = {}
3434
for attr in node.attributes.keys():
3535
attr = node.getAttributeNode(attr)
36-
attrs.append({"namespace": attr.namespaceURI,
37-
"name": attr.localName,
38-
"value": attr.value})
36+
attrs[(attr.namespaceURI,attr.localName)] = attr.value
3937
if name in voidElements:
4038
for token in self.emptyTag(namespace,
4139
name,

html5lib/treewalkers/simpletree.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,12 @@ def getNodeDetails(self, node):
3232
return _base.TEXT, node.value
3333

3434
elif node.type == 5: # Element
35-
attrs = []
35+
attrs = {}
3636
for name, value in node.attributes.items():
3737
if isinstance(name, tuple):
38-
attrs.append({"namespace": name[2],
39-
"name": name[1],
40-
"value": value})
38+
attrs[(name[2],name[1])] = value
4139
else:
42-
attrs.append({"namespace": None,
43-
"name": name,
44-
"value": value})
40+
attrs[(None,name)] = value
4541
return (_base.ELEMENT, node.namespace, node.name,
4642
attrs, node.hasContent())
4743

0 commit comments

Comments
 (0)