From b0c3975db8871453b200b1431a513540381f05fa Mon Sep 17 00:00:00 2001
From: blag
Date: Mon, 28 Apr 2014 10:23:11 -0600
Subject: [PATCH 1/7] Allow Data URI Schemes

From https://en.wikipedia.org/wiki/Data_URI_scheme, allow the
pseudo-protocol 'data'.
---
 html5lib/sanitizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 469d9b40..9f1b2261 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -138,7 +138,7 @@ class HTMLSanitizerMixin(object):
     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                             'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
 
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
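A note before the next patch: the hunk above only adds 'data' to the scheme
whitelist, which by itself accepts any payload. The sketch below (not part of
the series; both example URIs are hypothetical) shows why that is risky and
what PATCH 2's content-type whitelist addresses: a scheme-only check cannot
tell an embedded PNG from an embedded HTML document.

    # Illustration only -- not part of the patch series.
    # Both URIs below pass a scheme-only whitelist check, but the second
    # one embeds an executable HTML document (<script>alert(1)</script>).
    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse       # Python 2

    benign = 'data:image/png;base64,iVBORw0KGgo='
    risky = 'data:text/html;base64,PHNjcmlwdD5hbGVydCgxKTwvc2NyaXB0Pg=='

    for uri in (benign, risky):
        parsed = urlparse(uri)
        # 'data' is in the whitelist either way; only the content type
        # (the first token of .path) distinguishes the two.
        print('%s %s' % (parsed.scheme, parsed.path.split(';', 1)[0]))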
From a265d2d38c85deb1dc56c3f66b0421ea02f828a9 Mon Sep 17 00:00:00 2001
From: Drew Hubl
Date: Thu, 22 May 2014 00:20:21 -0600
Subject: [PATCH 2/7] Allow the data URI scheme, a whitelist for content
 types, and update tests to correctly check URIs

---
 html5lib/sanitizer.py            | 51 +++++++++++++++++++++++++++++---
 html5lib/tests/test_sanitizer.py | 24 ++++++++++-----
 html5lib/tests/testdata          |  2 +-
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 9f1b2261..c4c34e08 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -2,6 +2,10 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
@@ -140,6 +144,8 @@ class HTMLSanitizerMixin(object):
                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
                             'ssh', 'sftp', 'rtsp', 'afs', 'data']
 
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
+
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
     allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
@@ -147,6 +153,7 @@
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                        (val_unescaped.split(':')[0] not in
-                         self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse(val_unescaped)
+                if uri:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    rgx = re.compile(r'''
+                                      ^
+                                      # Match a content type <application>/<type>
+                                      (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                      # Match any character set and encoding
+                                      # Note that this does not prevent the
+                                      # same one being set twice
+                                      # The charset group is currently unused
+                                      (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
+                                      # Match the base64-encoded or urlencoded
+                                      # data
+                                      # The data group is currently unused
+                                      (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
+                                      $
+                                      ''',
+                                     re.VERBOSE)
+                    if uri.scheme == 'data':
+                        m = rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        if m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+                        if m.group('encoding'):
+                            if m.group('encoding') == 'base64':
+                                # If the encoding identifier is base64, then
+                                # make sure the data is encoded in base64
+                                if not m.group('base64_encoded_data'):
+                                    del attrs[attr]
+                            else:
+                                del attrs[attr]
+                        else:
+                            # If the encoding is not given, expect the data to
+                            # be urlencoded
+                            if not m.group('url_encoded_data'):
+                                del attrs[attr]
 
         for attr in self.svg_attr_val_allows_ref:
             if attr in attrs:
                 attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 1cc687df..ad1d6f18 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue  # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = 'http://sub.domain.tld/path/object.ext'
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
-               "<p %s='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
+               "<a href=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<a href="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<a href=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<a href="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
diff --git a/html5lib/tests/testdata b/html5lib/tests/testdata
index a9badff0..f6a1b202 160000
--- a/html5lib/tests/testdata
+++ b/html5lib/tests/testdata
@@ -1 +1 @@
-Subproject commit a9badff0cd2fe337170769d42ca2df5e96d30f97
+Subproject commit f6a1b202de14fc057b196044c5ebef4672be3dd0
From afa01fffaa63a92f3228da74b23cb81ee044a7ff Mon Sep 17 00:00:00 2001
From: Drew Hubl
Date: Thu, 22 May 2014 01:11:16 -0600
Subject: [PATCH 3/7] Guarantee we use an allowed protocol

---
 html5lib/tests/test_sanitizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index ad1d6f18..4862570d 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -82,7 +82,7 @@ def test_sanitizer():
             continue
         attribute_value = 'foo'
         if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
-            attribute_value = 'http://sub.domain.tld/path/object.ext'
+            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
                "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                "<p %s='%s'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
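What that parametrized value expands to, assuming 'ed2k' stays first in
acceptable_protocols (it heads the list shown in PATCH 1):

    # The scheme now comes from the sanitizer's own whitelist rather
    # than a hard-coded 'http'. List abbreviated from PATCH 1.
    allowed_protocols = ['ed2k', 'ftp', 'http']  # abbreviated copy
    attribute_value = '%s://sub.domain.tld/path/object.ext' % allowed_protocols[0]
    print(attribute_value)  # ed2k://sub.domain.tld/path/object.ext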
" % (attribute_name, attribute_value), From 5210af72c932f7c48fced47e30537f99c5a61f93 Mon Sep 17 00:00:00 2001 From: Drew Hubl Date: Thu, 22 May 2014 01:18:53 -0600 Subject: [PATCH 4/7] Add ourselves to AUTHORS file --- AUTHORS.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 4148a6ed..b9a8fc8b 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -32,3 +32,5 @@ Patches and suggestions - Juan Carlos Garcia Segovia - Mike West - Marc DM +- Drew Hubl +- Austin Kumbera From a1abf71b147bad6d6fecc25003fa423e2e51aa80 Mon Sep 17 00:00:00 2001 From: Drew Hubl Date: Thu, 14 Aug 2014 16:22:58 -0600 Subject: [PATCH 5/7] Use six to import urlparse --- html5lib/sanitizer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py index c4c34e08..8cd4b61d 100644 --- a/html5lib/sanitizer.py +++ b/html5lib/sanitizer.py @@ -2,10 +2,7 @@ import re from xml.sax.saxutils import escape, unescape -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse +from six.moves import urllib_parse as urlparse from .tokenizer import HTMLTokenizer from .constants import tokenTypes @@ -196,7 +193,7 @@ def allowed_token(self, token, token_type): unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") - uri = urlparse(val_unescaped) + uri = urlparse.urlparse(val_unescaped) if uri: if uri.scheme not in self.allowed_protocols: del attrs[attr] From ee798c26368403635c6f60de2039b7c85f26e702 Mon Sep 17 00:00:00 2001 From: Drew Hubl Date: Thu, 14 Aug 2014 16:38:05 -0600 Subject: [PATCH 6/7] Rename and move content type regex to the top of the file so we only compile it once --- html5lib/sanitizer.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py index 8cd4b61d..a5a1237b 100644 --- a/html5lib/sanitizer.py +++ b/html5lib/sanitizer.py @@ -8,6 +8,24 @@ from .constants import tokenTypes +content_type_rgx = re.compile(r''' + ^ + # Match a content type / + (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) + # Match any character set and encoding + # Note that this does not prevent the + # same one being set twice + # The charset group is currently unused + (?:;charset=(?P[-a-zA-Z0-9]+)|;(?Pbase64)){0,2} + # Match the base64-encoded or urlencoded + # data + # The data group is currently unused + (?P,(?P[a-zA-Z0-9+/]+=*|(?P[a-zA-Z0-9]+|%[a-fA-F0-9]{2}))) + $ + ''', + re.VERBOSE) + + class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -197,24 +215,8 @@ def allowed_token(self, token, token_type): if uri: if uri.scheme not in self.allowed_protocols: del attrs[attr] - rgx = re.compile(r''' - ^ - # Match a content type / - (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) - # Match any character set and encoding - # Note that this does not prevent the - # same one being set twice - # The charset group is currently unused - (?:;charset=(?P[-a-zA-Z0-9]+)|;(?Pbase64)){0,2} - # Match the base64-encoded or urlencoded - # data - # The data group is currently unused - (?P,(?P[a-zA-Z0-9+/]+=*|(?P[a-zA-Z0-9]+|%[a-fA-F0-9]{2}))) - $ - ''', - re.VERBOSE) if uri.scheme == 'data': - m = rgx.match(uri.path) + m = content_type_rgx.match(uri.path) if not m: del attrs[attr] if m.group('content_type') not in self.allowed_content_types: From 81176fc920d13980c5633b0e3703de20091b11cb Mon Sep 17 00:00:00 2001 From: Drew Hubl Date: 
From 81176fc920d13980c5633b0e3703de20091b11cb Mon Sep 17 00:00:00 2001
From: Drew Hubl
Date: Thu, 14 Aug 2014 17:10:48 -0600
Subject: [PATCH 7/7] Don't check the encoding or the data, and switch charset
 and encoding to non-capturing subpatterns

---
 html5lib/sanitizer.py | 25 ++++---------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index a5a1237b..6bbd872f 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -13,14 +13,10 @@
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
-                               # Note that this does not prevent the
-                               # same one being set twice
-                               # The charset group is currently unused
-                               (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
-                               # Match the base64-encoded or urlencoded
-                               # data
-                               # The data group is currently unused
-                               (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
                                $
                                ''',
                               re.VERBOSE)
@@ -221,19 +217,6 @@ def allowed_token(self, token, token_type):
                             del attrs[attr]
                         if m.group('content_type') not in self.allowed_content_types:
                             del attrs[attr]
-                        if m.group('encoding'):
-                            if m.group('encoding') == 'base64':
-                                # If the encoding identifier is base64, then
-                                # make sure the data is encoded in base64
-                                if not m.group('base64_encoded_data'):
-                                    del attrs[attr]
-                            else:
-                                del attrs[attr]
-                        else:
-                            # If the encoding is not given, expect the data to
-                            # be urlencoded
-                            if not m.group('url_encoded_data'):
-                                del attrs[attr]
 
         for attr in self.svg_attr_val_allows_ref:
             if attr in attrs:
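For reference, the final pattern compiled on its own: charset and base64 may
now appear in either order, and anything after the comma is accepted
unchecked (the sample paths below are hypothetical).

    import re

    # The PATCH 7 pattern, reproduced verbatim.
    content_type_rgx = re.compile(r'''
        ^
        # Match a content type <application>/<type>
        (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
        # Match any character set and encoding
        (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
          |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
        # Assume the rest is data
        ,.*
        $
        ''', re.VERBOSE)

    # The last path shows that the data part is no longer validated.
    for path in ('image/png;base64,aGVsbG8=',
                 'image/png;charset=utf-8;base64,aGVsbG8=',
                 'image/png;base64;charset=utf-8,aGVsbG8=',
                 'image/png,any!bytes@here'):
        m = content_type_rgx.match(path)
        print('%s -> %s' % (path, m.group('content_type') if m else 'rejected'))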