* Make CDATA section parsing context-dependent.

* Add HTMLParser.support_cdata().
python · serhiy-storchaka · Jun 18, 2025 · Jul 3, 2025 · Jul 3, 2025 · Jul 4, 2025
commit 524cac599dc5554650e6f1a8c81d808fa8ef54d6
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
@@ -121,6 +121,17 @@ The output will then be:
    attributes can be preserved, etc.).
 
 
+.. method:: HTMLParser.support_cdata(flag)
+
+   Sets how the parser will parse CDATA declarations.
+   If *flag* is true, then the :meth:`unknown_decl` method will be called
+   for the CDATA section ``<![CDATA[...]]>``.
+   If *flag* is false, then the :meth:`handle_comment` method will be called
+   for ``<![CDATA[...>``.
-   If *flag* is false, then the :meth:`handle_comment` method will be called
-   for ``<![CDATA[...>``.
+   If *flag* is false, or if :meth:`!support_cdata` has not been called yet,
+   then the :meth:`handle_comment` method will be called for ``<![CDATA[...>``.
+
+   .. versionadded:: 3.13.6
-   .. versionadded:: 3.13.6
+   .. versionadded:: 3.13.6
+
+      Previously, :meth:`unknown_decl` was called for ``<![CDATA[...>``.
+
+
 The following methods are called when data or markup elements are encountered
 and they are meant to be overridden in a subclass.  The base class
 implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):

@@ -144,6 +144,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._support_cdata = False
         super().reset()
 
     def feed(self, data):
@@ -174,6 +175,9 @@ def clear_cdata_mode(self):
         self.interesting = interesting_normal
         self.cdata_elem = None
 
+    def support_cdata(self, flag=True):
+        self._support_cdata = flag
+
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
     # true, force handling all data as if followed by EOF marker.
@@ -249,7 +253,10 @@ def goahead(self, end):
                                 break
                         self.handle_comment(rawdata[i+4:j])
                     elif startswith("<![CDATA[", i):
-                        self.unknown_decl(rawdata[i+3:])
+                        if self._support_cdata:
+                            self.unknown_decl(rawdata[i+3:])
+                        else:
+                            self.handle_comment(rawdata[i+1:])
                     elif rawdata[i:i+9].lower() == '<!doctype':
                         self.handle_decl(rawdata[i+2:])
                     elif startswith("<!", i):
@@ -325,11 +332,14 @@ def parse_html_declaration(self, i):
             # this case is actually already handled in goahead()
             return self.parse_comment(i)
         elif rawdata[i:i+9] == '<![CDATA[':
-            j = rawdata.find(']]>')
-            if j < 0:
-                return -1
-            self.unknown_decl(rawdata[i+3: j])
-            return j + 3
+            if self._support_cdata:
+                j = rawdata.find(']]>', i+9)
+                if j < 0:
+                    return -1
+                self.unknown_decl(rawdata[i+3: j])
+                return j + 3
+            else:
+                return self.parse_bogus_comment(i)
         elif rawdata[i:i+9].lower() == '<!doctype':
             # find the closing >
             gtpos = rawdata.find('>', i+9)

@@ -34,12 +34,16 @@ def get_events(self):
 
     def handle_starttag(self, tag, attrs):
         self.append(("starttag", tag, attrs))
+        if tag == 'svg':
+            self.support_cdata(True)
 
     def handle_startendtag(self, tag, attrs):
         self.append(("startendtag", tag, attrs))
 
     def handle_endtag(self, tag):
         self.append(("endtag", tag))
+        if tag == 'svg':
+            self.support_cdata(False)
 
     # all other markup
 
@@ -643,10 +647,22 @@ def test_eof_in_declarations(self):
             ('<!', [('comment', '')]),
             ('<!-', [('comment', '-')]),
             ('<![', [('comment', '[')]),
-            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
-            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
-            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
-            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
+            ('<![CDATA[', [('comment', '![CDATA[')]),
+            ('<![CDATA[x', [('comment', '![CDATA[x')]),
+            ('<![CDATA[x]', [('comment', '![CDATA[x]')]),
+            ('<![CDATA[x]]', [('comment', '![CDATA[x]]')]),
+            ('<svg><text y="100"><![CDATA[',
+             [('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
+              ('unknown decl', 'CDATA[')]),
+            ('<svg><text y="100"><![CDATA[x',
+             [('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
+              ('unknown decl', 'CDATA[x')]),
+            ('<svg><text y="100"><![CDATA[x]',
+             [('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
+              ('unknown decl', 'CDATA[x]')]),
+            ('<svg><text y="100"><![CDATA[x]]',
+             [('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
+              ('unknown decl', 'CDATA[x]]')]),
             ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
             ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
             ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -737,11 +753,35 @@ def test_broken_condcoms(self):
          '        printf("[<marquee>How?</marquee>]");\n'
          '    }\n'),
     ])
-    def test_cdata_section(self, content):
+    def test_cdata_section_content(self, content):
         # See "13.2.5.42 Markup declaration open state",
         # "13.2.5.69 CDATA section state", and issue bpo-32876.
-        html = f'<![CDATA[{content}]]>'
-        expected = [('unknown decl', 'CDATA[' + content)]
+        html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
+        expected = [
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[' + content),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+        ]
+        self._run_check(html, expected)
+
+    def test_cdata_section(self):
+        # See "13.2.5.42 Markup declaration open state".
+        html = ('<![CDATA[foo<br>bar]]>'
+                '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
+                '<![CDATA[foo<br>bar]]>')
+        expected = [
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[foo<br>bar'),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
+        ]
         self._run_check(html, expected)
 
     def test_convert_charrefs_dropped_text(self):