gh-69426: only unescape properly terminated character entities in attribute values
python · serhiy-storchaka · May 7, 2025 · Jul 24, 2022 · Jul 24, 2022 · Jan 14, 2023
commit 71a89f98c31e2f1285221568de73fbc1e09ad84d
@@ -12,6 +12,7 @@
 import _markupbase
 
 from html import unescape
+from html.entities import html5 as html5_entities
 
 
 __all__ = ['HTMLParser']
@@ -57,6 +58,26 @@
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
+
+def replace_attr_charref(match):
+    ref = match.group(0)
+    # Numeric / hex char refs must always be unescaped
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # Named character / entity references must only be unescaped
+    # if they are an exact match, and they are not followed by an equals sign
+    terminates_with_equals = ref.endswith('=')
+    exact_match = ref.lstrip('&').rstrip('=') in html5_entities
+    if exact_match and not terminates_with_equals:
+        return unescape(ref)
+    # Otherwise do not unescape
+    return ref
+
+def unescape_attrvalue(s):
+    return attr_charref.sub(replace_attr_charref, s)
 
 
 class HTMLParser(_markupbase.ParserBase):
@@ -322,7 +343,7 @@ def parse_starttag(self, i):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = unescape_attrvalue(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 

@@ -347,17 +347,17 @@ def test_convert_charrefs(self):
         self.assertTrue(collector().convert_charrefs)
         charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
         # check charrefs in the middle of the text/attributes
-        expected = [('starttag', 'a', [('href', 'foo"zar')]),
+        expected = [('starttag', 'a', [('href', 'foo " zar')]),
                     ('data', 'a"z'), ('endtag', 'a')]
         for charref in charrefs:
-            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+            self._run_check('<a href="foo {0} zar">a{0}z</a>'.format(charref),
                             expected, collector=collector())
-        # check charrefs at the beginning/end of the text/attributes
+        # check charrefs at the beginning/end of the text
         expected = [('data', '"'),
-                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+                    ('starttag', 'a', []),
                     ('data', '"'), ('endtag', 'a'), ('data', '"')]
         for charref in charrefs:
-            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+            self._run_check('{0}<a>'
                             '{0}</a>{0}'.format(charref),
                             expected, collector=collector())
         # check charrefs in <script>/<style> elements
@@ -380,6 +380,48 @@ def test_convert_charrefs(self):
         self._run_check('no charrefs here', [('data', 'no charrefs here')],
                         collector=collector())
 
+    def test_convert_charrefs_in_attribute_values(self):
+        # default value for convert_charrefs is now True
+        collector = lambda: EventCollectorCharrefs()
+        self.assertTrue(collector().convert_charrefs)
+
+        # do unescape numeric and hex char refs
+        expected = [('starttag', 'a',
+                     [('href', 'https://example.com?foo¢=bar¢&baz¢=bla¢')]),
+                    ('endtag', 'a')]
+        self._run_check('<a href="https://example.com?foo&#xa2;=bar&#xa2&baz&#162;=bla&#162"></a>', expected, collector=collector())
+
+        # do unescape entity matches not followed by ASCII alphanumeric
+        expected = [('starttag', 'a',
+                     [('href', 'https://example.com?foo¢¢ ¢+¢')]),
+                    ('endtag', 'a')]
+        self._run_check('<a href="https://example.com?foo&cent;&cent &cent+&cent"></a>', expected, collector=collector())
+
+        # do not unescape entity matches followed by ASCII alphanumeric
+        expected = [('starttag', 'a',
+                     [('href', 'https://example.com?foo&center&cent123')]),
+                    ('endtag', 'a')]
+        self._run_check('<a href="https://example.com?foo&center&cent123"></a>', expected, collector=collector())
+
+        # do not unescape entity matches followed by equals
+        expected = [('starttag', 'a',
+                     [('href', 'https://example.com?foo&cent=123')]),
+                    ('endtag', 'a')]
+        self._run_check('<a href="https://example.com?foo&cent=123"></a>', expected, collector=collector())
+
+        # do unescape terminated entity matches followed by equals
+        expected = [('starttag', 'a',
+                     [('href', 'https://example.com?foo¢=123')]),
+                    ('endtag', 'a')]
+        self._run_check('<a href="https://example.com?foo&cent;=123"></a>', expected, collector=collector())
+
+        # do unescape char refs at the beginning and end of attribute values
+        charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
+        expected = [('starttag', 'a', [('x', '"'), ('y', '"-X'), ('z', 'X-"')]), ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a x="{0}" y="{0}-X" z="X-{0}"></a>'.format(charref),
+                            expected, collector=collector())
+
     # the remaining tests were for the "tolerant" parser (which is now
     # the default), and check various kind of broken markup
     def test_tolerant_parsing(self):

diff --git a/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst b/Misc/NEWS.d/next/Library/2022-07-24-20-56-32.gh-issue-69426.unccw7.rst
@@ -0,0 +1,2 @@
+Fix :class:`HTMLParser` to not unescape character entities in attribute
+values if they are followed by an ASCII alphanumeric or an equals sign.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Fix :class:`HTMLParser` to not unescape character entities in attribute
		values if they are followed by an ASCII alphanumeric or an equals sign.