From a43ff9d0735998ba4bbb99b6977af255dc09faa3 Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Thu, 23 Feb 2017 13:22:58 -0500 Subject: [PATCH] Fix alphabeticalattributes filter namespace problem If a tag has an attribute with a None namespace and one with a str namespace, then this filter would fail with a TypeError in Python 3. This fixes that. Fixes #322 --- html5lib/filters/alphabeticalattributes.py | 13 ++- html5lib/tests/test_alphabeticalattributes.py | 81 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 html5lib/tests/test_alphabeticalattributes.py diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py index 4795baec..f938ba1a 100644 --- a/html5lib/filters/alphabeticalattributes.py +++ b/html5lib/filters/alphabeticalattributes.py @@ -8,13 +8,24 @@ from ordereddict import OrderedDict +def _attr_key(attr): + """Return an appropriate key for an attribute for sorting + + Attributes have a namespace that can be either ``None`` or a string. We + can't compare the two because they're different types, so we convert + ``None`` to an empty string first. 
+ + """ + return (attr[0][0] or ''), attr[0][1] + + class Filter(base.Filter): def __iter__(self): for token in base.Filter.__iter__(self): if token["type"] in ("StartTag", "EmptyTag"): attrs = OrderedDict() for name, value in sorted(token["data"].items(), - key=lambda x: x[0]): + key=_attr_key): attrs[name] = value token["data"] = attrs yield token diff --git a/html5lib/tests/test_alphabeticalattributes.py b/html5lib/tests/test_alphabeticalattributes.py new file mode 100644 index 00000000..9e560a1e --- /dev/null +++ b/html5lib/tests/test_alphabeticalattributes.py @@ -0,0 +1,81 @@ +from __future__ import absolute_import, division, unicode_literals + +try: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict + +import pytest + +import html5lib +from html5lib.filters.alphabeticalattributes import Filter +from html5lib.serializer import HTMLSerializer + + +@pytest.mark.parametrize('msg, attrs, expected_attrs', [ + ( + 'no attrs', + {}, + {} + ), + ( + 'one attr', + {(None, 'alt'): 'image'}, + OrderedDict([((None, 'alt'), 'image')]) + ), + ( + 'multiple attrs', + { + (None, 'src'): 'foo', + (None, 'alt'): 'image', + (None, 'style'): 'border: 1px solid black;' + }, + OrderedDict([ + ((None, 'alt'), 'image'), + ((None, 'src'), 'foo'), + ((None, 'style'), 'border: 1px solid black;') + ]) + ), +]) +def test_alphabetizing(msg, attrs, expected_attrs): + tokens = [{'type': 'StartTag', 'name': 'img', 'data': attrs}] + output_tokens = list(Filter(tokens)) + + attrs = output_tokens[0]['data'] + assert attrs == expected_attrs + + +def test_with_different_namespaces(): + tokens = [{ + 'type': 'StartTag', + 'name': 'pattern', + 'data': { + (None, 'id'): 'patt1', + ('http://www.w3.org/1999/xlink', 'href'): '#patt2' + } + }] + output_tokens = list(Filter(tokens)) + + attrs = output_tokens[0]['data'] + assert attrs == OrderedDict([ + ((None, 'id'), 'patt1'), + (('http://www.w3.org/1999/xlink', 'href'), '#patt2') + ]) + + +def 
test_with_serializer(): + """Verify filter works in the context of everything else""" + parser = html5lib.HTMLParser() + dom = parser.parseFragment('') + walker = html5lib.getTreeWalker('etree') + ser = HTMLSerializer( + alphabetical_attributes=True, + quote_attr_values='always' + ) + + # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When + # that gets fixed, we can fix this expected result. + assert ( + ser.render(walker(dom)) == + '' + )