Allow the data URI scheme with a whitelist of content types, and update tests to check URIs correctly

Drew Hubl · gsnedders · commit f64486590942 · 2015-04-29T00:09:54.000+01:00
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -32,4 +32,6 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Drew Hubl
+- Austin Kumbera
 - Jim Baker
diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
@@ -2,11 +2,26 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+from six.moves import urllib_parse as urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
 
 
+content_type_rgx = re.compile(r'''
+                               ^
+                               # Match a content type <application>/<type>
+                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                               # Match any character set and encoding
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
+                               $
+                               ''',
+                              re.VERBOSE)
+
+
 class HTMLSanitizerMixin(object):
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
 
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                             'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
 
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,21 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse.urlparse(val_unescaped)
+                # Scheme-less (relative) references have no protocol to vet,
+                # so only values with an explicit scheme are checked.
+                if uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    elif uri.scheme == 'data':
+                        # data: URIs are kept only when they parse and their
+                        # declared content type is on the whitelist.
+                        m = content_type_rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue  # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)