Skip to content

Commit 81176fc

Browse files
Drew Hubl, CaptainCodeman
Drew Hubl
authored and committed
Don't check the encoding or the data, and switch charset and encoding to non-capturing subpatterns
1 parent ee798c2 commit 81176fc

File tree

1 file changed

+4
-21
lines changed

1 file changed

+4
-21
lines changed

html5lib/sanitizer.py

Lines changed: 4 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -13,14 +13,10 @@
13 13
# Match a content type <application>/<type>
14 14
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
15 15
# Match any character set and encoding
16-
# Note that this does not prevent the
17-
# same one being set twice
18-
# The charset group is currently unused
19-
(?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
20-
# Match the base64-encoded or urlencoded
21-
# data
22-
# The data group is currently unused
23-
(?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
16+
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
17+
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
18+
# Assume the rest is data
19+
,.*
24 20
$
25 21
''',
26 22
re.VERBOSE)
@@ -221,19 +217,6 @@ def allowed_token(self, token, token_type):
221 217
del attrs[attr]
222 218
if m.group('content_type') not in self.allowed_content_types:
223 219
del attrs[attr]
224-
if m.group('encoding'):
225-
if m.group('encoding') == 'base64':
226-
# If the encoding identifier is base64, then
227-
# make sure the data is encoded in base64
228-
if not m.group('base64_encoded_data'):
229-
del attrs[attr]
230-
else:
231-
del attrs[attr]
232-
else:
233-
# If the encoding is not given, expect the data to
234-
# be urlencoded
235-
if not m.group('url_encoded_data'):
236-
del attrs[attr]
237 220

238 221
for attr in self.svg_attr_val_allows_ref:
239 222
if attr in attrs:

0 commit comments

Comments
 (0)