From 2153a4cc126dfcfa1cabf19bd3a025624d646b65 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 15 Aug 2025 23:08:48 +0300
Subject: [PATCH] gh-137836: Support more RAWTEXT and PLAINTEXT elements in
 HTMLParser

* the "plaintext" element
* the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes"
* optionally RAWTEXT (if scripting=True) element "noscript"
---
 Doc/library/html.parser.rst                   |   8 +-
 Lib/html/parser.py                            |  17 +-
 Lib/test/test_htmlparser.py                   | 169 ++++++++++++++----
 ...-08-15-23-08-44.gh-issue-137836.b55rhh.rst |   3 +
 4 files changed, 160 insertions(+), 37 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst

diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index dd67fc34e856f1..81b9239185aab1 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,7 +15,7 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
 
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
 
    Create a parser instance able to parse invalid markup.
 
@@ -23,6 +23,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    references (except the ones in ``script``/``style`` elements) are
    automatically converted to the corresponding Unicode characters.
 
+   If *scripting* is true, the ``noscript`` element is parsed in the
+   RAWTEXT mode.
+
    An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
    when start tags, end tags, text, comments, and other markup elements are
    encountered.  The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
    .. versionchanged:: 3.5
       The default value for argument *convert_charrefs* is now ``True``.
 
+   .. versionchanged:: 3.13.8
+      Added the *scripting* parameter.
+
 
 Example HTML Parser Application
 -------------------------------
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 75bf8adae6d70a..79850fa6981d55 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
     argument.
     """
 
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
+
+        If scripting is true, the noscript element is parsed in the
+        RAWTEXT mode.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
         self.reset()
 
     def reset(self):
@@ -454,6 +460,11 @@ def parse_starttag(self, i):
                 self.set_cdata_mode(tag)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
+            elif self.scripting and tag == "noscript":
+                self.set_cdata_mode(tag)
+            elif tag == "plaintext":
+                self.set_cdata_mode(tag)
+                self.interesting = re.compile(r'\Z')  # only at end of input
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index fff41dab321acd..64cc6d8f1893f2 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -324,49 +324,138 @@ def test_style_content(self, content):
                             ("data", content),
                             ("endtag", "style")])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /title>',
-            '</ title>',
-            '</titled>',
-            '</title\v>',
-            '</title\xa0>',
-            '</tıtle>',
+    @support.subTests('tag', ['title', 'textarea'])
+    def test_rcdata_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", content),
+            ("endtag", tag),
         ])
-    def test_title_content(self, content):
-        source = f"<title>{content}</title>"
+        source = f"<{tag}>&amp;</{tag}>"
         self._run_check(source, [
-            ("starttag", "title", []),
+            ("starttag", tag, []),
+            ('entityref', 'amp'),
+            ("endtag", tag),
+        ])
+
+    @support.subTests('tag',
+            ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+    def test_rawtext_content(self, tag):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /{tag}>'
+            f'</ {tag}>'
+            f'</{tag}x>'
+            f'</{tag}\v>'
+            f'</{tag}\xa0>'
+        )
+        source = f"<{tag}>{content}</{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
             ("data", content),
-            ("endtag", "title"),
+            ("endtag", tag),
         ])
 
-    @support.subTests('content', [
-            '<!-- not a comment -->',
-            "<not a='start tag'>",
-            '<![CDATA[not a cdata]]>',
-            '<!not a bogus comment>',
-            '</not a bogus comment>',
-            '\u2603',
-            '< /textarea>',
-            '</ textarea>',
-            '</textareable>',
-            '</textarea\v>',
-            '</textarea\xa0>',
+    def test_noscript_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            f'< /noscript>'
+            f'</ noscript>'
+            f'</noscriptx>'
+            f'</noscript\v>'
+            f'</noscript\xa0>'
+        )
+        source = f"<noscript>{content}</noscript>"
+        self._run_check(source, [
+            ('starttag', 'noscript', []),
+            ('comment', ' not a comment '),
+            ('entityref', 'not'),
+            ('data', '-an-entity-ref;'),
+            ('starttag', 'not', [('a', 'start tag')]),
+            ('unknown decl', 'CDATA[not a cdata'),
+            ('comment', 'not a bogus comment'),
+            ('endtag', 'not'),
+            ('data', '☃< /noscript>'),
+            ('comment', ' noscript'),
+            ('endtag', 'noscriptx'),
+            ('endtag', 'noscript\x0b'),
+            ('endtag', 'noscript\xa0'),
+            ('endtag', 'noscript')
         ])
-    def test_textarea_content(self, content):
-        source = f"<textarea>{content}</textarea>"
         self._run_check(source, [
-            ("starttag", "textarea", []),
+            ("starttag", "noscript", []),
+            ("data", content),
+            ("endtag", "noscript"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+    def test_plaintext_content(self):
+        content = (
+            '<!-- not a comment -->'
+            '&not-an-entity-ref;'
+            "<not a='start tag'>"
+            '<![CDATA[not a cdata]]>'
+            '<!not a bogus comment>'
+            '</not a bogus comment>'
+            '\u2603'
+            '</plaintext>'
+        )
+        source = f"<plaintext>{content}"
+        self._run_check(source, [
+            ("starttag", "plaintext", []),
             ("data", content),
-            ("endtag", "textarea"),
         ])
 
+    @support.subTests('tag,endtag', [
+            ('title', 'tıtle'),
+            ('style', 'ſtyle'),
+            ('style', 'ﬅyle'),
+            ('style', 'ﬆyle'),
+            ('iframe', 'ıframe'),
+            ('noframes', 'noframeſ'),
+            ('noscript', 'noſcript'),
+            ('noscript', 'noscrıpt'),
+            ('script', 'ſcript'),
+            ('script', 'scrıpt'),
+        ])
+    def test_invalid_nonascii_closing_tag(self, tag, endtag):
+        source = f"<{tag}><a></{endtag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+        source = f"<{tag}><a></{endtag}></{tag}>"
+        self._run_check(source, [
+            ("starttag", tag, []),
+            ("data", f"<a></{endtag}>"),
+            ("endtag", tag),
+        ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script foo=">"'])
     def test_script_closing_tag(self, endtag):
@@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
                             ("endtag", "textarea")],
                         collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP',
+                                   'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt',
+                                   'ScrIPt'])
+    def test_closing_tag(self, starttag):
+        tag = starttag.lower()
+        for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
+                       f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
+            content = "<!-- not a comment --><i>Spam</i>"
+            s = f'<{starttag}>{content}</{endtag}>'
+            self._run_check(s, [("starttag", tag, []),
+                                ('data', content),
+                                ("endtag", tag)],
+                            collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
     @support.subTests('tail,end', [
         ('', False),
         ('<', False),
diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
new file mode 100644
index 00000000000000..c30c9439a76a19
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
@@ -0,0 +1,3 @@
+Add support for the "plaintext" element, RAWTEXT elements "xmp", "iframe",
+"noembed" and "noframes", and optionally RAWTEXT element "noscript" in
+:class:`html.parser.HTMLParser`.