Skip to content

gh-69426: only unescape properly terminated character entities in attribute values #95215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
20 changes: 19 additions & 1 deletion Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import _markupbase

from html import unescape
from html.entities import html5 as html5_entities


__all__ = ['HTMLParser']
Expand All @@ -23,6 +24,7 @@

# Matches '&', then a name that starts with an ASCII letter and continues
# with letters, digits, '-' or '.' (captured as group 1), then exactly one
# character that is not an ASCII letter or digit.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder why there are . and - symbols in the name here? It may not be related to this issue.

# Matches '&#' followed by decimal digits, or by 'x'/'X' and hex digits,
# and then exactly one character that is not a hex digit.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# Matches '&' followed by a decimal reference ('#' + digits), a hex
# reference ('#x'/'#X' + hex digits) or an alphanumeric name starting with
# a letter (captured as group 1).  A single trailing ';' or '=' is included
# in the overall match when present, but is not part of group 1.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')

# Matches '<' immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
# Matches a single literal '>'.
piclose = re.compile('>')
Expand Down Expand Up @@ -57,6 +59,22 @@
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Character reference processing logic specific to attribute values
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
def _replace_attr_charref(match):
    """Return the replacement text for one character reference candidate.

    *match* is a regular expression match object; its full text
    (``match.group(0)``) is the candidate reference, including any
    trailing ``;`` or ``=`` that the pattern consumed.

    The candidate is passed through ``unescape()`` when either:

    * it is a numeric reference (it starts with ``&#``), or
    * it does not end with ``=`` and the text after the ``&`` is a key of
      the HTML5 named entity table.

    In every other case the matched text is returned unchanged.
    """
    ref = match.group(0)
    # Numeric / hex char refs must always be unescaped.
    if ref.startswith('&#'):
        return unescape(ref)
    # A named reference followed by an equals sign is left as it is.
    if ref.endswith('='):
        return ref
    # Named references are only unescaped on an exact table match; the
    # lookup key keeps a trailing ';' when the pattern consumed one.
    if ref[1:] in html5_entities:
        return unescape(ref)
    # Otherwise do not unescape.
    return ref

def _unescape_attrvalue(s):
    """Return *s* with the character references in it selectively unescaped.

    Every match of ``attr_charref`` in *s* is replaced by whatever
    ``_replace_attr_charref`` returns for it; all other text is kept as is.
    """
    return attr_charref.sub(_replace_attr_charref, s)


class HTMLParser(_markupbase.ParserBase):
Expand Down Expand Up @@ -323,7 +341,7 @@ def parse_starttag(self, i):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = unescape(attrvalue)
attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()

Expand Down
43 changes: 35 additions & 8 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,18 +348,16 @@ def test_convert_charrefs(self):
collector = lambda: EventCollectorCharrefs()
self.assertTrue(collector().convert_charrefs)
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
# check charrefs in the middle of the text/attributes
expected = [('starttag', 'a', [('href', 'foo"zar')]),
('data', 'a"z'), ('endtag', 'a')]
# check charrefs in the middle of the text
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
for charref in charrefs:
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
self._run_check('<a>a{0}z</a>'.format(charref),
expected, collector=collector())
# check charrefs at the beginning/end of the text/attributes
expected = [('data', '"'),
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
# check charrefs at the beginning/end of the text
expected = [('data', '"'), ('starttag', 'a', []),
('data', '"'), ('endtag', 'a'), ('data', '"')]
for charref in charrefs:
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
self._run_check('{0}<a>'
'{0}</a>{0}'.format(charref),
expected, collector=collector())
# check charrefs in <script>/<style> elements
Expand All @@ -382,6 +380,35 @@ def test_convert_charrefs(self):
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())

def test_convert_charrefs_in_attribute_values(self):
    """Check how character references inside attribute values are decoded."""
    # default value for convert_charrefs is now True
    collector = lambda: EventCollectorCharrefs()
    self.assertTrue(collector().convert_charrefs)

    # One start tag carrying the reference at the start, end and middle of
    # an attribute value, and followed by an alphanumeric, a space and an
    # equals sign.  Shared by both checks below.
    markup = ('<a x="{0}" x="z{0}" x="{0}z" '
              ' x="z{0}z" x="{0} z" x="{0}=z"></a>')

    # always unescape terminated entity refs, numeric and hex char refs:
    # - regardless whether they are at start, middle, end of attribute
    # - or followed by alphanumeric, non-alphanumeric, or equals char
    charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
                  ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
                ('endtag', 'a')]
    for charref in charrefs:
        self._run_check(markup.format(charref), expected,
                        collector=collector())

    # only unescape unterminated entity matches if they are not followed by
    # an alphanumeric or an equals sign
    charref = '&cent'
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
                  ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
                ('endtag', 'a')]
    self._run_check(markup.format(charref), expected,
                    collector=collector())

# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kind of broken markup
def test_tolerant_parsing(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix :class:`html.parser.HTMLParser` to not unescape named character
references in attribute values when they lack a terminating semicolon and
are followed by an ASCII alphanumeric or an equals sign.
Loading