Commit 52f9ca6

mfagsnedders authored and committed
refactor allowed_token and disallowed_token as new methods in HTMLSanitizerMixin for usage in subclasses.
1 parent 90aa9f4 commit 52f9ca6


html5lib/sanitizer.py

Lines changed: 51 additions & 45 deletions
@@ -168,57 +168,63 @@ def sanitize_token(self, token):
         if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]):
             if token["name"] in self.allowed_elements:
-                if "data" in token:
-                    attrs = dict([(name,val) for name,val in
-                                  token["data"][::-1]
-                                  if name in self.allowed_attributes])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        #remove replacement characters from unescaped characters
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
-                            (val_unescaped.split(':')[0] not in
-                             self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token["name"] in self.svg_allow_local_href and
-                        'xlink:href' in attrs and re.search('^\s*[^#\s].*',
-                                                            attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token["data"] = [[name,val] for name,val in list(attrs.items())]
-                return token
+                return self.allowed_token(token, token_type)
             else:
-                if token_type == tokenTypes["EndTag"]:
-                    token["data"] = "</%s>" % token["name"]
-                elif token["data"]:
-                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                    token["data"] = "<%s%s>" % (token["name"],attrs)
-                else:
-                    token["data"] = "<%s>" % token["name"]
-                if token.get("selfClosing"):
-                    token["data"]=token["data"][:-1] + "/>"
-
-                if token["type"] in list(tokenTypes.keys()):
-                    token["type"] = "Characters"
-                else:
-                    token["type"] = tokenTypes["Characters"]
-
-                del token["name"]
-                return token
+                return self.disallowed_token(token, token_type)
         elif token_type == tokenTypes["Comment"]:
             pass
         else:
             return token
 
+    def allowed_token(self, token, token_type):
+        if "data" in token:
+            attrs = dict([(name,val) for name,val in
+                          token["data"][::-1]
+                          if name in self.allowed_attributes])
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                #remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
+                    (val_unescaped.split(':')[0] not in
+                     self.allowed_protocols)):
+                    del attrs[attr]
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
+                                                    attrs['xlink:href'])):
+                del attrs['xlink:href']
+            if 'style' in attrs:
+                attrs['style'] = self.sanitize_css(attrs['style'])
+            token["data"] = [[name,val] for name,val in list(attrs.items())]
+        return token
+
+    def disallowed_token(self, token, token_type):
+        if token_type == tokenTypes["EndTag"]:
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+            token["data"] = "<%s%s>" % (token["name"],attrs)
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"]=token["data"][:-1] + "/>"
+
+        if token["type"] in list(tokenTypes.keys()):
+            token["type"] = "Characters"
+        else:
+            token["type"] = tokenTypes["Characters"]
+
+        del token["name"]
+        return token
+
     def sanitize_css(self, style):
         # disallow urls
         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
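Note: the point of this refactor is that subclasses of HTMLSanitizer (or anything mixing in HTMLSanitizerMixin) can now override either hook without re-implementing sanitize_token. A minimal sketch of the intended usage follows; it is not part of this commit, and the class name StrippingSanitizer is hypothetical. It relies on the sanitizing tokenizer's __iter__ in this version of html5lib only yielding tokens for which sanitize_token returns a truthy value, so returning None drops a disallowed tag instead of escaping it into visible text:

from html5lib import html5parser, sanitizer

class StrippingSanitizer(sanitizer.HTMLSanitizer):
    # Hypothetical subclass: drop disallowed tags outright rather than
    # re-emitting them as escaped Characters tokens (the default above).
    # sanitize_token passes our None through, and the sanitizing
    # tokenizer skips falsy tokens, so the tag never reaches the tree.
    def disallowed_token(self, token, token_type):
        return None

p = html5parser.HTMLParser(tokenizer=StrippingSanitizer)
doc = p.parseFragment("<p>ok</p><script>alert(1)</script>")

With the stock HTMLSanitizer the <script> tag would survive as escaped text; with the override above it is removed entirely, while allowed_token still filters attributes on the elements that are kept.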
