Skip to content

Allow data URI schemes, with content type whitelist #156

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ Patches and suggestions
- Juan Carlos Garcia Segovia
- Mike West
- Marc DM
- Drew Hubl
- Austin Kumbera
35 changes: 30 additions & 5 deletions html5lib/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,26 @@

import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse

from .tokenizer import HTMLTokenizer
from .constants import tokenTypes


content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)


class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""

Expand Down Expand Up @@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs']
'ssh', 'sftp', 'rtsp', 'afs', 'data']

acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']

# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
Expand All @@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types

# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
Expand Down Expand Up @@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
uri = urlparse.urlparse(val_unescaped)
if uri:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
if m.group('content_type') not in self.allowed_content_types:
del attrs[attr]

for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
Expand Down
24 changes: 17 additions & 7 deletions html5lib/tests/test_sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,12 @@ def test_sanitizer():
continue # TODO
if attribute_name == 'style':
continue
attribute_value = 'foo'
if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
toxml)

for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
Expand All @@ -93,13 +96,20 @@ def test_sanitizer():
toxml)

for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
"<a href=\"%s\">foo</a>" % protocol,
"""<a href="%s">foo</a>""" % protocol,
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
toxml)

for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
"<a href=\"%s\">foo</a>" % protocol,
"""<a href="%s">foo</a>""" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
toxml)