Skip to content

Commit 6d23249

Browse files
author
Drew Hubl
committed
Rename and move content type regex to the top of the file so we only compile it once
1 parent af24793 commit 6d23249

File tree

1 file changed

+19
-17
lines changed

1 file changed

+19
-17
lines changed

html5lib/sanitizer.py

Lines changed: 19 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,24 @@
88
from .constants import tokenTypes
99

1010

11+
content_type_rgx = re.compile(r'''
12+
^
13+
# Match a content type <application>/<type>
14+
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
15+
# Match any character set and encoding
16+
# Note that this does not prevent the
17+
# same one being set twice
18+
# The charset group is currently unused
19+
(?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
20+
# Match the base64-encoded or urlencoded
21+
# data
22+
# The data group is currently unused
23+
(?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
24+
$
25+
''',
26+
re.VERBOSE)
27+
28+
1129
class HTMLSanitizerMixin(object):
1230
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
1331

@@ -197,24 +215,8 @@ def allowed_token(self, token, token_type):
197215
if uri:
198216
if uri.scheme not in self.allowed_protocols:
199217
del attrs[attr]
200-
rgx = re.compile(r'''
201-
^
202-
# Match a content type <application>/<type>
203-
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
204-
# Match any character set and encoding
205-
# Note that this does not prevent the
206-
# same one being set twice
207-
# The charset group is currently unused
208-
(?:;charset=(?P<charset>[-a-zA-Z0-9]+)|;(?P<encoding>base64)){0,2}
209-
# Match the base64-encoded or urlencoded
210-
# data
211-
# The data group is currently unused
212-
(?P<data>,(?P<base64_encoded_data>[a-zA-Z0-9+/]+=*|(?P<url_encoded_data>[a-zA-Z0-9]+|%[a-fA-F0-9]{2})))
213-
$
214-
''',
215-
re.VERBOSE)
216218
if uri.scheme == 'data':
217-
m = rgx.match(uri.path)
219+
m = content_type_rgx.match(uri.path)
218220
if not m:
219221
del attrs[attr]
220222
if m.group('content_type') not in self.allowed_content_types:

0 commit comments

Comments
 (0)