Skip to content

gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser #137837

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Doc/library/html.parser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.

.. class:: HTMLParser(*, convert_charrefs=True)
.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)

Create a parser instance able to parse invalid markup.

If *convert_charrefs* is ``True`` (the default), all character
references (except the ones in elements such as ``script`` and
``style``) are automatically converted to the corresponding Unicode
characters.

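If *scripting* is false (the default), the content of the ``noscript``
element is parsed as normal markup. If *scripting* is true, the
``noscript`` element is parsed in the RAWTEXT mode, so its content is
reported as text without being parsed.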

An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
Expand All @@ -37,6 +40,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.

.. versionchanged:: 3.13.8
Added the *scripting* parameter.


Example HTML Parser Application
-------------------------------
Expand Down
17 changes: 14 additions & 3 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,23 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
# See the HTML5 specs section "13.4 Parsing HTML fragments".
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

def __init__(self, *, convert_charrefs=True):
def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.

If convert_charrefs is True (the default), all character references
If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.

If scripting is true, the noscript element is parsed in the
RAWTEXT mode.
"""
super().__init__()
self.convert_charrefs = convert_charrefs
self.scripting = scripting
self.reset()

def reset(self):
Expand Down Expand Up @@ -454,6 +460,11 @@ def parse_starttag(self, i):
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
elif self.scripting and tag == "noscript":
self.set_cdata_mode(tag)
elif tag == "plaintext":
self.set_cdata_mode(tag)
self.interesting = re.compile(r'\z')
return endpos

# Internal -- check to see if we have a complete starttag; return end
Expand Down
169 changes: 136 additions & 33 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,49 +324,138 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])

@support.subTests('content', [
'<!-- not a comment -->',
"<not a='start tag'>",
'<![CDATA[not a cdata]]>',
'<!not a bogus comment>',
'</not a bogus comment>',
'\u2603',
'< /title>',
'</ title>',
'</titled>',
'</title\v>',
'</title\xa0>',
'</tıtle>',
@support.subTests('tag', ['title', 'textarea'])
def test_rcdata_content(self, tag):
content = (
'<!-- not a comment -->'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /{tag}>'
f'</ {tag}>'
f'</{tag}x>'
f'</{tag}\v>'
f'</{tag}\xa0>'
)
source = f"<{tag}>{content}</{tag}>"
self._run_check(source, [
("starttag", tag, []),
("data", content),
("endtag", tag),
])
def test_title_content(self, content):
source = f"<title>{content}</title>"
source = f"<{tag}>&amp;</{tag}>"
self._run_check(source, [
("starttag", "title", []),
("starttag", tag, []),
('entityref', 'amp'),
("endtag", tag),
])

@support.subTests('tag',
['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
def test_rawtext_content(self, tag):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /{tag}>'
f'</ {tag}>'
f'</{tag}x>'
f'</{tag}\v>'
f'</{tag}\xa0>'
)
source = f"<{tag}>{content}</{tag}>"
self._run_check(source, [
("starttag", tag, []),
("data", content),
("endtag", "title"),
("endtag", tag),
])

@support.subTests('content', [
'<!-- not a comment -->',
"<not a='start tag'>",
'<![CDATA[not a cdata]]>',
'<!not a bogus comment>',
'</not a bogus comment>',
'\u2603',
'< /textarea>',
'</ textarea>',
'</textareable>',
'</textarea\v>',
'</textarea\xa0>',
def test_noscript_content(self):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
f'< /noscript>'
f'</ noscript>'
f'</noscriptx>'
f'</noscript\v>'
f'</noscript\xa0>'
)
source = f"<noscript>{content}</noscript>"
self._run_check(source, [
('starttag', 'noscript', []),
('comment', ' not a comment '),
('entityref', 'not'),
('data', '-an-entity-ref;'),
('starttag', 'not', [('a', 'start tag')]),
('unknown decl', 'CDATA[not a cdata'),
('comment', 'not a bogus comment'),
('endtag', 'not'),
('data', '☃< /noscript>'),
('comment', ' noscript'),
('endtag', 'noscriptx'),
('endtag', 'noscript\x0b'),
('endtag', 'noscript\xa0'),
('endtag', 'noscript')
])
def test_textarea_content(self, content):
source = f"<textarea>{content}</textarea>"
self._run_check(source, [
("starttag", "textarea", []),
("starttag", "noscript", []),
("data", content),
("endtag", "noscript"),
], collector=EventCollector(convert_charrefs=False, scripting=True))

def test_plaintext_content(self):
content = (
'<!-- not a comment -->'
'&not-an-entity-ref;'
"<not a='start tag'>"
'<![CDATA[not a cdata]]>'
'<!not a bogus comment>'
'</not a bogus comment>'
'\u2603'
'</plaintext>'
)
source = f"<plaintext>{content}"
self._run_check(source, [
("starttag", "plaintext", []),
("data", content),
("endtag", "textarea"),
])

@support.subTests('tag,endtag', [
    ('title', 'tıtle'),
    ('style', 'ſtyle'),
    # The two ligatures are spelled as escapes so that text normalization
    # cannot silently rewrite them to "ſt"/"st"; a plain ASCII "style"
    # would be a valid closing tag and defeat the purpose of this test.
    ('style', '\ufb05yle'),  # U+FB05 LATIN SMALL LIGATURE LONG S T
    ('style', '\ufb06yle'),  # U+FB06 LATIN SMALL LIGATURE ST
    ('iframe', 'ıframe'),
    ('noframes', 'noframeſ'),
    ('noscript', 'noſcript'),
    ('noscript', 'noscrıpt'),
    ('script', 'ſcript'),
    ('script', 'scrıpt'),
])
def test_invalid_nonascii_closing_tag(self, tag, endtag):
    """Check that a non-ASCII look-alike end tag does not close *tag*.

    *endtag* is spelled with a non-ASCII character in place of an ASCII
    letter of *tag*.  The expected events show that such an end tag is
    reported as part of the element's text, and that only the genuine
    ASCII end tag (second check) produces an "endtag" event.
    """
    # Without a genuine end tag, everything after the start tag is data.
    source = f"<{tag}><a></{endtag}>"
    self._run_check(source, [
        ("starttag", tag, []),
        ("data", f"<a></{endtag}>"),
    ], collector=EventCollector(convert_charrefs=False, scripting=True))
    # With a genuine end tag appended, the look-alike is still data and
    # only the real end tag closes the element.
    source = f"<{tag}><a></{endtag}></{tag}>"
    self._run_check(source, [
        ("starttag", tag, []),
        ("data", f"<a></{endtag}>"),
        ("endtag", tag),
    ], collector=EventCollector(convert_charrefs=False, scripting=True))

@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
def test_script_closing_tag(self, endtag):
Expand Down Expand Up @@ -428,6 +517,20 @@ def test_textarea_closing_tag(self, endtag):
("endtag", "textarea")],
collector=EventCollectorNoNormalize(convert_charrefs=False))

@support.subTests('starttag', [
    'TitLe', 'TexTarEa', 'StyLE', 'XmP', 'iFraMe',
    'noEmBed', 'noFraMes', 'noScrIPt', 'ScrIPt',
])
def test_closing_tag(self, starttag):
    """Check which end-tag spellings close a mixed-case start tag.

    For each start tag, the end tag is tried in lower case, in upper
    case, followed by a space, a newline or a slash, and followed by
    attributes.  In every case the content between the tags is expected
    as a single "data" event, followed by an "endtag" event carrying the
    lower-cased tag name.
    """
    tag = starttag.lower()
    content = "<!-- not a comment --><i>Spam</i>"
    end_variants = (
        tag,
        tag.upper(),
        f'{tag} ',
        f'{tag}\n',
        f'{tag}/',
        f'{tag} foo=bar',
        f'{tag} foo=">"',
    )
    for endtag in end_variants:
        markup = f'<{starttag}>{content}</{endtag}>'
        expected_events = [
            ("starttag", tag, []),
            ('data', content),
            ("endtag", tag),
        ]
        # A fresh collector per variant, with scripting enabled.
        collector = EventCollectorNoNormalize(convert_charrefs=False,
                                              scripting=True)
        self._run_check(markup, expected_events, collector=collector)

@support.subTests('tail,end', [
('', False),
('<', False),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add support for the "plaintext" element, the RAWTEXT elements "xmp",
"iframe", "noembed" and "noframes", and optionally the RAWTEXT element
"noscript" in :class:`html.parser.HTMLParser`.
Loading