Skip to content

Commit f644865

Browse files
Drew Hubl, gsnedders
Drew Hubl
authored and gsnedders committed
Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs
1 parent b51828b commit f644865

File tree

3 files changed

+49
-12
lines changed

3 files changed

+49
-12
lines changed

AUTHORS.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,4 +32,6 @@ Patches and suggestions
3232
- Juan Carlos Garcia Segovia
3333
- Mike West
3434
- Marc DM
35+
- Drew Hubl
36+
- Austin Kumbera
3537
- Jim Baker

html5lib/sanitizer.py

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,26 @@
22

33
import re
44
from xml.sax.saxutils import escape, unescape
5+
from six.moves import urllib_parse as urlparse
56

67
from .tokenizer import HTMLTokenizer
78
from .constants import tokenTypes
89

910

11+
content_type_rgx = re.compile(r'''
12+
^
13+
# Match a content type <application>/<type>
14+
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
15+
# Match any character set and encoding
16+
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
17+
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
18+
# Assume the rest is data
19+
,.*
20+
$
21+
''',
22+
re.VERBOSE)
23+
24+
1025
class HTMLSanitizerMixin(object):
1126
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
1227

@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138153
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
139154
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
140155
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
141-
'ssh', 'sftp', 'rtsp', 'afs']
156+
'ssh', 'sftp', 'rtsp', 'afs', 'data']
157+
158+
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
142159

143160
# subclasses may define their own versions of these constants
144161
allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147164
allowed_css_keywords = acceptable_css_keywords
148165
allowed_svg_properties = acceptable_svg_properties
149166
allowed_protocols = acceptable_protocols
167+
allowed_content_types = acceptable_content_types
150168

151169
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152170
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189207
unescape(attrs[attr])).lower()
190208
# remove replacement characters from unescaped characters
191209
val_unescaped = val_unescaped.replace("\ufffd", "")
192-
if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
193-
(val_unescaped.split(':')[0] not in
194-
self.allowed_protocols)):
195-
del attrs[attr]
210+
uri = urlparse.urlparse(val_unescaped)
211+
if uri:
212+
if uri.scheme not in self.allowed_protocols:
213+
del attrs[attr]
214+
if uri.scheme == 'data':
215+
m = content_type_rgx.match(uri.path)
216+
if not m:
217+
del attrs[attr]
218+
if m.group('content_type') not in self.allowed_content_types:
219+
del attrs[attr]
220+
196221
for attr in self.svg_attr_val_allows_ref:
197222
if attr in attrs:
198223
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',

html5lib/tests/test_sanitizer.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,12 @@ def test_sanitizer():
8080
continue # TODO
8181
if attribute_name == 'style':
8282
continue
83+
attribute_value = 'foo'
84+
if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
85+
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
8386
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
84-
"<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
85-
"<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
87+
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
88+
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
8689
toxml)
8790

8891
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
9396
toxml)
9497

9598
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
96-
yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
97-
"<a href=\"%s\">foo</a>" % protocol,
98-
"""<a href="%s">foo</a>""" % protocol,
99+
rest_of_uri = '//sub.domain.tld/path/object.ext'
100+
if protocol == 'data':
101+
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
102+
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
103+
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
104+
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
99105
toxml)
100106

101107
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
108+
rest_of_uri = '//sub.domain.tld/path/object.ext'
109+
if protocol == 'data':
110+
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
111+
protocol = protocol.upper()
102112
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
103-
"<a href=\"%s\">foo</a>" % protocol,
104-
"""<a href="%s">foo</a>""" % protocol,
113+
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
114+
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
105115
toxml)

0 commit comments

Comments
 (0)