
Commit a265d2d

Drew Hubl authored and CaptainCodeman committed
Allow the data URI scheme with a whitelist of acceptable content types, and update tests to correctly check URIs
1 parent b0c3975 commit a265d2d
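
A rough sketch of what the change is meant to enable, assuming the pre-1.0 html5lib entry point where the sanitizer is installed as the parser's tokenizer (the data: URIs below are illustrative, not from the commit):

import html5lib
from html5lib import sanitizer

# Parser whose tokenizer sanitizes markup as it is tokenized.
parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)

# A data: URI with a whitelisted content type (image/png) should now keep
# its src attribute...
kept = parser.parseFragment('<img src="data:image/png;base64,aGVsbG8gd29ybGQ=">')

# ...while a data: URI with a non-whitelisted type (text/html) should have
# the src attribute removed during sanitization.
stripped = parser.parseFragment('<img src="data:text/html;base64,PHNjcmlwdD4=">')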

File tree

3 files changed: +65 -12 lines changed


html5lib/sanitizer.py

Lines changed: 47 additions & 4 deletions
@@ -2,6 +2,10 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
@@ -140,13 +144,16 @@ class HTMLSanitizerMixin(object):
         'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
         'ssh', 'sftp', 'rtsp', 'afs', 'data']
 
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
+
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
     allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
     allowed_css_properties = acceptable_css_properties
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
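
The new whitelist follows the existing pattern of class-level constants, so, as the comment in this hunk notes, a subclass can define its own version. A minimal hypothetical example (PngOnlySanitizer is not part of the library):

from html5lib import sanitizer

class PngOnlySanitizer(sanitizer.HTMLSanitizer):
    # Hypothetical override: only PNG data: URIs keep their attribute;
    # data: URIs with any other content type are stripped.
    allowed_content_types = ['image/png']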
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                    (val_unescaped.split(':')[0] not in
-                     self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse(val_unescaped)
+                if uri:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    rgx = re.compile(r'''
+                                     ^
+                                     # Match a content type <application>/<type>
+                                     (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                     # Match any character set and encoding
+                                     # Note that this does not prevent the
+                                     # same one being set twice
+                                     # The charset group is currently unused
+                                     (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
+                                     # Match the base64-encoded or urlencoded
+                                     # data
+                                     # The data group is currently unused
+                                     (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
+                                     $
+                                     ''',
+                                     re.VERBOSE)
+                    if uri.scheme == 'data':
+                        m = rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        if m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+                        if m.group('encoding'):
+                            if m.group('encoding') == 'base64':
+                                # If the encoding identifier is base64, then
+                                # make sure the data is encoded in base64
+                                if not m.group('base64_encoded_data'):
+                                    del attrs[attr]
+                            else:
+                                del attrs[attr]
+                        else:
+                            # If the encoding is not given, expect the data to
+                            # be urlencoded
+                            if not m.group('url_encoded_data'):
+                                del attrs[attr]
+
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
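
For readers tracing the new control flow outside the tokenizer, here is a condensed standalone sketch of the same checks using only the standard library; the protocol whitelist is abbreviated and the nested encoding branches above are collapsed into two return statements:

import re
from urllib.parse import urlparse

# Abbreviated whitelists for the example; the real ones live on the mixin above.
allowed_protocols = ['http', 'https', 'mailto', 'data']
allowed_content_types = ['image/png', 'image/jpeg', 'image/gif',
                         'image/webp', 'image/bmp', 'text/plain']

# Same pattern as the commit, minus the inline comments.
data_uri = re.compile(r'''
    ^
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
    (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
    $
    ''', re.VERBOSE)

def attribute_survives(value):
    # Mirrors the attribute check above: True means the URI-valued attribute is kept.
    uri = urlparse(value)
    if uri.scheme not in allowed_protocols:
        return False
    if uri.scheme != 'data':
        return True
    m = data_uri.match(uri.path)
    if not m or m.group('content_type') not in allowed_content_types:
        return False
    if m.group('encoding') == 'base64':
        return bool(m.group('base64_encoded_data'))
    return bool(m.group('url_encoded_data'))

print(attribute_survives('https://example.com/a.png'))               # True
print(attribute_survives('data:image/png;base64,aGVsbG8gd29ybGQ='))  # True
print(attribute_survives('data:text/html;base64,PHNjcmlwdD4='))      # False: type not whitelisted
print(attribute_survives('javascript:alert(1)'))                     # False: scheme not whitelisted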

html5lib/tests/test_sanitizer.py

Lines changed: 17 additions & 7 deletions
@@ -80,9 +80,12 @@ def test_sanitizer():
             continue # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = 'http://sub.domain.tld/path/object.ext'
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
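
One note on the fixture: the base64 payload used for the data: URI tests decodes to a harmless placeholder string, which is easy to confirm:

import base64

# 'aGVsbG8gd29ybGQ=' is the base64 encoding of b'hello world'.
assert base64.b64decode('aGVsbG8gd29ybGQ=') == b'hello world'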

html5lib/tests/testdata

Submodule testdata updated 62 files
