gh-135661: Fix CDATA section parsing in HTMLParser

"] ]>" and "]] >" no longer end the CDATA section.
python · serhiy-storchaka · Jun 18, 2025 · Jul 3, 2025 · Jul 3, 2025 · Jul 4, 2025
commit f7f9f562f1b31c2130e26269cf4f196f378d80f2
@@ -298,7 +298,11 @@ def parse_html_declaration(self, i):
             # this case is actually already handled in goahead()
             return self.parse_comment(i)
         elif rawdata[i:i+9] == '<![CDATA[':
-            return self.parse_marked_section(i)
+            j = rawdata.find(']]>', i+9)
+            if j < 0:
+                return -1
+            self.unknown_decl(rawdata[i+3:j])
+            return j + 3
         elif rawdata[i:i+9].lower() == '<!doctype':
             # find the closing >
             gtpos = rawdata.find('>', i+9)

@@ -686,27 +686,27 @@ def test_broken_condcoms(self):
         ]
         self._run_check(html, expected)
 
-    def test_cdata_declarations(self):
-        # More tests should be added. See also "8.2.4.42. Markup
-        # declaration open state", "8.2.4.69. CDATA section state",
-        # and issue 32876
-        html = ('<![CDATA[just some plain text]]>')
-        expected = [('unknown decl', 'CDATA[just some plain text')]
-        self._run_check(html, expected)
-
-    def test_cdata_declarations_multiline(self):
-        html = ('<code><![CDATA['
-                '    if (a < b && a > b) {'
-                '        printf("[<marquee>How?</marquee>]");'
-                '    }'
-                ']]></code>')
-        expected = [
-            ('starttag', 'code', []),
-            ('unknown decl',
-             'CDATA[    if (a < b && a > b) {        '
-             'printf("[<marquee>How?</marquee>]");    }'),
-            ('endtag', 'code')
-        ]
+    @support.subTests('content', [
+        'just some plain text',
+        '<!-- not a comment -->',
+        '&not-an-entity-ref;',
+        "<not a='start tag'>",
+        '',
+        '[[I have many brackets]]',
+        'I have a > in the middle',
+        'I have a ]] in the middle',
+        '] ]>',
+        ']] >',
+        ('\n'
+         '    if (a < b && a > b) {\n'
+         '        printf("[<marquee>How?</marquee>]");\n'
+         '    }\n'),
+    ])
+    def test_cdata_section(self, content):
+        # See "13.2.5.42 Markup declaration open state",
+        # "13.2.5.69 CDATA section state", and issue bpo-32876.
+        html = f'<![CDATA[{content}]]>'
+        expected = [('unknown decl', 'CDATA[' + content)]
         self._run_check(html, expected)
 
     def test_convert_charrefs_dropped_text(self):

diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
@@ -0,0 +1,2 @@
+Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and
+``]] >`` no longer end the CDATA section.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and
		``]] >`` no longer end the CDATA section.