
Commit a265d2d

Drew Hubl authored and CaptainCodeman committed
Allow the data URI scheme with a whitelist of acceptable content types, and update tests to correctly check URIs
1 parent b0c3975 commit a265d2d
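
A rough sketch of what the change is meant to enable, assuming the pre-1.0 html5lib entry point where the sanitizer is installed as the parser's tokenizer (the data: URIs below are illustrative, not from the commit):

import html5lib
from html5lib import sanitizer

# Parser whose tokenizer sanitizes markup as it is tokenized.
parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)

# A data: URI with a whitelisted content type (image/png) should now keep
# its src attribute...
kept = parser.parseFragment('<img src="data:image/png;base64,aGVsbG8gd29ybGQ=">')

# ...while a data: URI with a non-whitelisted type (text/html) should have
# the src attribute removed during sanitization.
stripped = parser.parseFragment('<img src="data:text/html;base64,PHNjcmlwdD4=">')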

File tree

3 files changed: +65 -12 lines changed


html5lib/sanitizer.py

Lines changed: 47 additions & 4 deletions
@@ -2,6 +2,10 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
@@ -140,13 +144,16 @@ class HTMLSanitizerMixin(object):
         'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
         'ssh', 'sftp', 'rtsp', 'afs', 'data']
 
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
+
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
     allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
     allowed_css_properties = acceptable_css_properties
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
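
The new whitelist follows the existing pattern of class-level constants, so, as the comment in this hunk notes, a subclass can define its own version. A minimal hypothetical example (PngOnlySanitizer is not part of the library):

from html5lib import sanitizer

class PngOnlySanitizer(sanitizer.HTMLSanitizer):
    # Hypothetical override: only PNG data: URIs keep their attribute;
    # data: URIs with any other content type are stripped.
    allowed_content_types = ['image/png']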
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse(val_unescaped)
+                if uri:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    rgx = re.compile(r'''
+                                     ^
+                                     # Match a content type <application>/<type>
+                                     (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                     # Match any character set and encoding
+                                     # Note that this does not prevent the
+                                     # same one being set twice
+                                     # The charset group is currently unused
+                                     (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
+                                     # Match the base64-encoded or urlencoded
+                                     # data
+                                     # The data group is currently unused
+                                     (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
+                                     $
+                                     ''',
+                                     re.VERBOSE)
+                    if uri.scheme == 'data':
+                        m = rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        if m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+                        if m.group('encoding'):
+                            if m.group('encoding') == 'base64':
+                                # If the encoding identifier is base64, then
+                                # make sure the data is encoded in base64
+                                if not m.group('base64_encoded_data'):
+                                    del attrs[attr]
+                            else:
+                                del attrs[attr]
+                        else:
+                            # If the encoding is not given, expect the data to
+                            # be urlencoded
+                            if not m.group('url_encoded_data'):
+                                del attrs[attr]
+
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
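
For readers tracing the new control flow outside the tokenizer, here is a condensed standalone sketch of the same checks using only the standard library; the protocol whitelist is abbreviated and the nested encoding branches above are collapsed into two return statements:

import re
from urllib.parse import urlparse

# Abbreviated whitelists for the example; the real ones live on the mixin above.
allowed_protocols = ['http', 'https', 'mailto', 'data']
allowed_content_types = ['image/png', 'image/jpeg', 'image/gif',
                         'image/webp', 'image/bmp', 'text/plain']

# Same pattern as the commit, minus the inline comments.
data_uri = re.compile(r'''
    ^
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
    (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
    $
    ''', re.VERBOSE)

def attribute_survives(value):
    # Mirrors the attribute check above: True means the URI-valued attribute is kept.
    uri = urlparse(value)
    if uri.scheme not in allowed_protocols:
        return False
    if uri.scheme != 'data':
        return True
    m = data_uri.match(uri.path)
    if not m or m.group('content_type') not in allowed_content_types:
        return False
    if m.group('encoding') == 'base64':
        return bool(m.group('base64_encoded_data'))
    return bool(m.group('url_encoded_data'))

print(attribute_survives('https://example.com/a.png'))               # True
print(attribute_survives('data:image/png;base64,aGVsbG8gd29ybGQ='))  # True
print(attribute_survives('data:text/html;base64,PHNjcmlwdD4='))      # False: type not whitelisted
print(attribute_survives('javascript:alert(1)'))                     # False: scheme not whitelisted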

html5lib/tests/test_sanitizer.py

Lines changed: 17 additions & 7 deletions
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = 'http://sub.domain.tld/path/object.ext'
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
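
One note on the fixture: the base64 payload used for the data: URI tests decodes to a harmless placeholder string, which is easy to confirm:

import base64

# 'aGVsbG8gd29ybGQ=' is the base64 encoding of b'hello world'.
assert base64.b64decode('aGVsbG8gd29ybGQ=') == b'hello world'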

html5lib/tests/testdata

Submodule testdata updated 62 files
