diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..b9a8fc8b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,5 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Drew Hubl +- Austin Kumbera diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py index 469d9b40..6bbd872f 100644 --- a/html5lib/sanitizer.py +++ b/html5lib/sanitizer.py @@ -2,11 +2,26 @@ import re from xml.sax.saxutils import escape, unescape +from six.moves import urllib_parse as urlparse from .tokenizer import HTMLTokenizer from .constants import tokenTypes +content_type_rgx = re.compile(r''' + ^ + # Match a content type / + (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) + # Match any character set and encoding + (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?) + |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?) + # Assume the rest is data + ,.* + $ + ''', + re.VERBOSE) + + class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object): acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs'] + 'ssh', 'sftp', 'rtsp', 'afs', 'data'] + + acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain'] # subclasses may define their own versions of these constants allowed_elements = acceptable_elements + mathml_elements + svg_elements @@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object): allowed_css_keywords = acceptable_css_keywords allowed_svg_properties = acceptable_svg_properties allowed_protocols = acceptable_protocols + allowed_content_types = acceptable_content_types # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style @@ -189,10 +207,17 @@ def allowed_token(self, token, token_type): unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and - (val_unescaped.split(':')[0] not in - self.allowed_protocols)): - del attrs[attr] + uri = urlparse.urlparse(val_unescaped) + if uri: + if uri.scheme not in self.allowed_protocols: + del attrs[attr] + if uri.scheme == 'data': + m = content_type_rgx.match(uri.path) + if not m: + del attrs[attr] + if m.group('content_type') not in self.allowed_content_types: + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 1cc687df..4862570d 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -80,9 +80,12 @@ def test_sanitizer(): continue # TODO if attribute_name == 'style': continue + attribute_value = 'foo' + if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: + attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, - "

foo <bad>bar</bad> baz

" % attribute_name, - "

foo bar baz

" % attribute_name, + "

foo <bad>bar</bad> baz

" % (attribute_name, attribute_value), + "

foo bar baz

" % (attribute_name, attribute_value), toxml) for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: @@ -93,13 +96,20 @@ def test_sanitizer(): toxml) for protocol in sanitizer.HTMLSanitizer.allowed_protocols: - yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol, - "foo" % protocol, - """foo""" % protocol, + rest_of_uri = '//sub.domain.tld/path/object.ext' + if protocol == 'data': + rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' + yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, + "foo" % (protocol, rest_of_uri), + """foo""" % (protocol, rest_of_uri), toxml) for protocol in sanitizer.HTMLSanitizer.allowed_protocols: + rest_of_uri = '//sub.domain.tld/path/object.ext' + if protocol == 'data': + rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' + protocol = protocol.upper() yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, - "foo" % protocol, - """foo""" % protocol, + "foo" % (protocol, rest_of_uri), + """foo""" % (protocol, rest_of_uri), toxml) diff --git a/html5lib/tests/testdata b/html5lib/tests/testdata index a9badff0..f6a1b202 160000 --- a/html5lib/tests/testdata +++ b/html5lib/tests/testdata @@ -1 +1 @@ -Subproject commit a9badff0cd2fe337170769d42ca2df5e96d30f97 +Subproject commit f6a1b202de14fc057b196044c5ebef4672be3dd0