2
2
3
3
import re
4
4
from xml .sax .saxutils import escape , unescape
5
+ from six .moves import urllib_parse as urlparse
5
6
6
7
from .tokenizer import HTMLTokenizer
7
8
from .constants import tokenTypes
8
9
9
10
11
# Pattern for the part of a ``data:`` URI that follows the scheme, i.e.
# ``<type>/<subtype>[;charset=...][;base64],<data>``.
#
# * The named group ``content_type`` captures ``<type>/<subtype>``.
# * An optional ``;charset=<name>`` and an optional ``;base64`` marker are
#   accepted in either order.
# * A comma is mandatory; everything after it is treated as opaque data.
#
# The pattern is anchored at both ends, so a string with no media type or
# no comma does not match at all.  It is compiled with ``re.VERBOSE``, so
# the whitespace and ``#`` comments inside the literal are not part of the
# expression.
content_type_rgx = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                              re.VERBOSE)
23
+
24
+
10
25
class HTMLSanitizerMixin (object ):
11
26
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
12
27
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
138
153
acceptable_protocols = ['ed2k' , 'ftp' , 'http' , 'https' , 'irc' ,
139
154
'mailto' , 'news' , 'gopher' , 'nntp' , 'telnet' , 'webcal' ,
140
155
'xmpp' , 'callto' , 'feed' , 'urn' , 'aim' , 'rsync' , 'tag' ,
141
- 'ssh' , 'sftp' , 'rtsp' , 'afs' ]
156
+ 'ssh' , 'sftp' , 'rtsp' , 'afs' , 'data' ]
157
+
158
+ acceptable_content_types = ['image/png' , 'image/jpeg' , 'image/gif' , 'image/webp' , 'image/bmp' , 'text/plain' ]
142
159
143
160
# subclasses may define their own versions of these constants
144
161
allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
147
164
allowed_css_keywords = acceptable_css_keywords
148
165
allowed_svg_properties = acceptable_svg_properties
149
166
allowed_protocols = acceptable_protocols
167
+ allowed_content_types = acceptable_content_types
150
168
151
169
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152
170
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
189
207
unescape (attrs [attr ])).lower ()
190
208
# remove replacement characters from unescaped characters
191
209
val_unescaped = val_unescaped .replace ("\ufffd " , "" )
192
- if (re .match ("^[a-z0-9][-+.a-z0-9]*:" , val_unescaped ) and
193
- (val_unescaped .split (':' )[0 ] not in
194
- self .allowed_protocols )):
195
- del attrs [attr ]
210
+ uri = urlparse .urlparse (val_unescaped )
211
+ if uri :
212
+ if uri .scheme not in self .allowed_protocols :
213
+ del attrs [attr ]
214
+ if uri .scheme == 'data' :
215
+ m = content_type_rgx .match (uri .path )
216
+ if not m :
217
+ del attrs [attr ]
218
+ if m .group ('content_type' ) not in self .allowed_content_types :
219
+ del attrs [attr ]
220
+
196
221
for attr in self .svg_attr_val_allows_ref :
197
222
if attr in attrs :
198
223
attrs [attr ] = re .sub (r'url\s*\(\s*[^#\s][^)]+?\)' ,
0 commit comments