From 2153a4cc126dfcfa1cabf19bd3a025624d646b65 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 15 Aug 2025 23:08:48 +0300 Subject: [PATCH] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser * the "plaintext" element * the RAWTEXT elements "xmp", "iframe", "noembed" and "noframes" * optionally RAWTEXT (if scripting=True) element "noscript" --- Doc/library/html.parser.rst | 8 +- Lib/html/parser.py | 17 +- Lib/test/test_htmlparser.py | 169 ++++++++++++++---- ...-08-15-23-08-44.gh-issue-137836.b55rhh.rst | 3 + 4 files changed, 160 insertions(+), 37 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..81b9239185aab1 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -15,7 +15,7 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(*, convert_charrefs=True) +.. class:: HTMLParser(*, convert_charrefs=True, scripting=False) Create a parser instance able to parse invalid markup. @@ -23,6 +23,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. references (except the ones in ``script``/``style`` elements) are automatically converted to the corresponding Unicode characters. + If *scripting* is true, the ``noscript`` element is parsed in the + RAWTEXT mode. + An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. The user should subclass :class:`.HTMLParser` and override its @@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. .. versionchanged:: 3.5 The default value for argument *convert_charrefs* is now ``True``. + .. versionchanged:: 3.13.8 + Added the *scripting* parameter. 
+ Example HTML Parser Application ------------------------------- diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 75bf8adae6d70a..79850fa6981d55 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase): argument. """ - CDATA_CONTENT_ELEMENTS = ("script", "style") + # See the HTML5 specs section "13.4 Parsing HTML fragments". + # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments + CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes") RCDATA_CONTENT_ELEMENTS = ("textarea", "title") - def __init__(self, *, convert_charrefs=True): + def __init__(self, *, convert_charrefs=True, scripting=False): """Initialize and reset this instance. - If convert_charrefs is True (the default), all character references + If convert_charrefs is true (the default), all character references are automatically converted to the corresponding Unicode characters. + + If scripting is true, the noscript element is parsed in the + RAWTEXT mode. 
""" super().__init__() self.convert_charrefs = convert_charrefs + self.scripting = scripting self.reset() def reset(self): @@ -454,6 +460,11 @@ def parse_starttag(self, i): self.set_cdata_mode(tag) elif tag in self.RCDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag, escapable=True) + elif self.scripting and tag == "noscript": + self.set_cdata_mode(tag) + elif tag == "plaintext": + self.set_cdata_mode(tag) + self.interesting = re.compile(r'\z') return endpos # Internal -- check to see if we have a complete starttag; return end diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index fff41dab321acd..64cc6d8f1893f2 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -324,49 +324,138 @@ def test_style_content(self, content): ("data", content), ("endtag", "style")]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /title>', - '', - '', - '', - '', - '', + @support.subTests('tag', ['title', 'textarea']) + def test_rcdata_content(self, tag): + content = ( + '' + "" + '' + '' + '' + '\u2603' + f'< /{tag}>' + f'' + f'' + f'' + f'' + ) + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), + ("data", content), + ("endtag", tag), ]) - def test_title_content(self, content): - source = f"{content}" + source = f"<{tag}>&" self._run_check(source, [ - ("starttag", "title", []), + ("starttag", tag, []), + ('entityref', 'amp'), + ("endtag", tag), + ]) + + @support.subTests('tag', + ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script']) + def test_rawtext_content(self, tag): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + f'< /{tag}>' + f'' + f'' + f'' + f'' + ) + source = f"<{tag}>{content}" + self._run_check(source, [ + ("starttag", tag, []), ("data", content), - ("endtag", "title"), + ("endtag", tag), ]) - @support.subTests('content', [ - '', - "", - '', - '', - '', - '\u2603', - '< /textarea>', - '', - '', - '', - '', + def 
test_noscript_content(self): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + f'< /noscript>' + f'' + f'' + f'' + f'' + ) + source = f"" + self._run_check(source, [ + ('starttag', 'noscript', []), + ('comment', ' not a comment '), + ('entityref', 'not'), + ('data', '-an-entity-ref;'), + ('starttag', 'not', [('a', 'start tag')]), + ('unknown decl', 'CDATA[not a cdata'), + ('comment', 'not a bogus comment'), + ('endtag', 'not'), + ('data', '☃< /noscript>'), + ('comment', ' noscript'), + ('endtag', 'noscriptx'), + ('endtag', 'noscript\x0b'), + ('endtag', 'noscript\xa0'), + ('endtag', 'noscript') ]) - def test_textarea_content(self, content): - source = f"" self._run_check(source, [ - ("starttag", "textarea", []), + ("starttag", "noscript", []), + ("data", content), + ("endtag", "noscript"), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + + def test_plaintext_content(self): + content = ( + '' + '¬-an-entity-ref;' + "" + '' + '' + '' + '\u2603' + '' + ) + source = f"{content}" + self._run_check(source, [ + ("starttag", "plaintext", []), ("data", content), - ("endtag", "textarea"), ]) + @support.subTests('tag,endtag', [ + ('title', 'tıtle'), + ('style', 'ſtyle'), + ('style', 'ſtyle'), + ('style', 'style'), + ('iframe', 'ıframe'), + ('noframes', 'noframeſ'), + ('noscript', 'noſcript'), + ('noscript', 'noscrıpt'), + ('script', 'ſcript'), + ('script', 'scrıpt'), + ]) + def test_invalid_nonascii_closing_tag(self, tag, endtag): + source = f"<{tag}><a></{endtag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", f"<a></{endtag}>"), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + source = f"<{tag}><a></{endtag}></{tag}>" + self._run_check(source, [ + ("starttag", tag, []), + ("data", f"<a></{endtag}>"), + ("endtag", tag), + ], collector=EventCollector(convert_charrefs=False, scripting=True)) + @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n', 'script/', 'script 
foo=bar', 'script foo=">"']) def test_script_closing_tag(self, endtag): @@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag): ("endtag", "textarea")], collector=EventCollectorNoNormalize(convert_charrefs=False)) + @support.subTests('starttag', ['TitLe', 'TexTarEa', 'StyLE', 'XmP', + 'iFraMe', 'noEmBed', 'noFraMes', 'noScrIPt', + 'ScrIPt']) + def test_closing_tag(self, starttag): + tag = starttag.lower() + for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n', + f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']: + content = "<!-- not a comment --><i>Spam</i>" + s = f'<{starttag}>{content}</{endtag}>' + self._run_check(s, [("starttag", tag, []), + ('data', content), + ("endtag", tag)], + collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True)) + @support.subTests('tail,end', [ ('', False), ('<', False), diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst new file mode 100644 index 00000000000000..c30c9439a76a19 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst @@ -0,0 +1,3 @@ +Add support for the "plaintext" element, RAWTEXT elements "xmp", "iframe", +"noembed" and "noframes", and optionally RAWTEXT element "noscript" in +:class:`html.parser.HTMLParser`.