2
2
3
3
import re
4
4
from xml .sax .saxutils import escape , unescape
5
+ try :
6
+ from urllib .parse import urlparse
7
+ except ImportError :
8
+ from urlparse import urlparse
5
9
6
10
from .tokenizer import HTMLTokenizer
7
11
from .constants import tokenTypes
@@ -138,7 +142,9 @@ class HTMLSanitizerMixin(object):
138
142
# URI schemes that survive sanitization on URI-valued attributes;
# values using any other scheme are stripped.
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                        'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                        'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
                        'ssh', 'sftp', 'rtsp', 'afs', 'data']

# MIME types permitted inside data: URIs.
acceptable_content_types = ['image/png']
142
148
143
149
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
150
157
151
158
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
152
159
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +196,46 @@ def allowed_token(self, token, token_type):
189
196
unescape (attrs [attr ])).lower ()
190
197
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
# Vet URI-valued attributes: drop any value whose scheme is not
# whitelisted, and additionally validate data: URIs (content type
# and payload encoding) before letting them through.
data_content_type = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                # Note that this does not prevent the
                                # same one being set twice
                                # The charset group is currently unused
                                (?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
                                # Match the base64-encoded or urlencoded
                                # data
                                # The data group is currently unused
                                (?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
                                $
                                ''',
                               re.VERBOSE)
uri = urlparse(val_unescaped)
# urlparse() always returns a (truthy) ParseResult, so test the
# scheme itself: schemeless (relative) values are left alone,
# matching the old "^[a-z0-9][-+.a-z0-9]*:" check this replaced.
if uri.scheme:
    if uri.scheme not in self.allowed_protocols:
        # Disallowed scheme -- strip the attribute. ('data' is in
        # allowed_protocols, so the branches below are mutually
        # exclusive with this delete; chaining with elif prevents
        # a KeyError from deleting the same attribute twice.)
        del attrs[attr]
    elif uri.scheme == 'data':
        m = data_content_type.match(uri.path)
        if m is None:
            # Unparseable data: payload -- drop it. (Guarding on
            # None also avoids dereferencing a failed match below.)
            del attrs[attr]
        elif m.group('content_type') not in self.allowed_content_types:
            del attrs[attr]
        elif m.group('encoding') == 'base64':
            # The encoding identifier says base64, so make sure
            # the data really is base64-encoded.
            if not m.group('base64_encoded_data'):
                del attrs[attr]
        elif m.group('encoding'):
            # Any other explicit encoding identifier is rejected
            # (unreachable with the current regex, kept for safety).
            del attrs[attr]
        else:
            # No encoding given: expect the data to be urlencoded.
            if not m.group('url_encoded_data'):
                del attrs[attr]
196
239
for attr in self .svg_attr_val_allows_ref :
197
240
if attr in attrs :
198
241
attrs [attr ] = re .sub (r'url\s*\(\s*[^#\s][^)]+?\)' ,
0 commit comments