From 420af549bfc2c876333b0d16fc8cee595df51440 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Sun, 14 Jul 2024 15:58:31 +0200 Subject: [PATCH 1/8] fix: add escapable raw text mode to html parser --- Lib/html/parser.py | 25 +++++++++++++++-- Lib/test/test_htmlparser.py | 54 ++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 13c95c34e505c8..df0c365e74126b 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -26,6 +26,7 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') +escapable_raw_text_close = re.compile('', re.I) commentclose = re.compile(r'--\s*>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; @@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase): """ CDATA_CONTENT_ELEMENTS = ("script", "style") + ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -99,6 +101,7 @@ def reset(self): self.lasttag = '???' 
self.interesting = interesting_normal self.cdata_elem = None + self.escapable_raw_text_elem = None super().reset() def feed(self, data): @@ -120,6 +123,14 @@ def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text + def set_escapable_raw_text_mode(self, elem): + self.escapable_raw_text_elem = elem.lower() + self.interesting = re.compile(r'' % self.escapable_raw_text_elem, re.I) + + def clear_escapable_raw_text_mode(self): + self.interesting = interesting_normal + self.escapable_raw_text_elem = None + def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'' % self.cdata_elem, re.I) @@ -136,7 +147,7 @@ def goahead(self, end): i = 0 n = len(rawdata) while i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end @@ -155,11 +166,13 @@ def goahead(self, end): if match: j = match.start() else: + if self.escapable_raw_text_elem: + break if self.cdata_elem: break j = n if i < j: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -336,6 +349,8 @@ def parse_starttag(self, i): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) + if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS: + self.set_escapable_raw_text_mode(tag) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos @@ -411,8 +426,14 @@ def parse_endtag(self, i): self.handle_data(rawdata[i:gtpos]) return gtpos + if self.escapable_raw_text_elem is not None: # title or textarea + if elem != self.escapable_raw_text_elem: + self.handle_data(rawdata[i:gtpos]) + return gtpos + self.handle_endtag(elem) self.clear_cdata_mode() + 
self.clear_escapable_raw_text_mode() return gtpos # Overridable -- finish processing of start+end tag: diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index b42a611c62c0aa..bf42489da7d162 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -285,7 +285,7 @@ def test_cdata_content(self): #'foo = ', #'foo = ', ] - elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] + elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea'] for content in contents: for element in elements: element_lower = element.lower() @@ -317,6 +317,58 @@ def get_events(self): ("endtag", element_lower)], collector=Collector(convert_charrefs=False)) + def test_escapable_raw_text_content(self): + contents = [ + '

This is a header

', + 'Rebelious

Heading' + ' ¬-an-entity-ref;', + "", + '

', + 'foo = "";', + 'foo = "";', + 'foo = <\n/title> ', + '', + '\n//\n', + 'foo = "";', + '', + # these two should be invalid according to the HTML 5 spec, + # section 8.1.2.2 + #'foo = ', + #'foo = ', + ] + elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] + for content in contents: + for element in elements: + element_lower = element.lower() + s = '<{element}>{content}'.format(element=element, + content=content) + self._run_check(s, [("starttag", element_lower, []), + ("data", content), + ("endtag", element_lower)]) + + def test_escapable_raw_text_with_closing_tags(self): + # see issue #13358 + # make sure that HTMLParser calls handle_data only once for each CDATA. + # The normal event collector normalizes the events in get_events, + # so we override it to return the original list of events. + class Collector(EventCollector): + def get_events(self): + return self.events + + content = """ ¬-an-entity-ref; +

+ ''""" + for element in [' script', 'script ', ' script ', + '\nscript', 'script\n', '\nscript\n']: + element_lower = element.lower().strip() + s = '{1}' - '{1}'.format(text, charref), + '{1}' + '{0}{1}' + '{1}'.format(text, charref), expected, collector=collector()) # check truncated charrefs at the end of the file html = '&quo &# &#x' From da868db7ced08fd3cf40a4e34f3021e1f191e872 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Tue, 13 May 2025 22:00:03 +0200 Subject: [PATCH 5/8] test: include raw text and escapable raw text elements in cdata content test --- Lib/test/test_htmlparser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 11e7d7b0d1dba4..0578eeba4037bc 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -285,7 +285,9 @@ def test_cdata_content(self): #'foo = ', #'foo = ', ] - elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea'] + tags = ['script', 'style', 'textarea', 'title'] + # test the following 'casing' for each tag: script, SCRIPT, Script etc. 
+ elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)] for content in contents: for element in elements: element_lower = element.lower() From a36070a641ff20f2a795476492ff0970dc9c2103 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Wed, 14 May 2025 20:12:22 +0200 Subject: [PATCH 6/8] update to latest main --- Lib/html/parser.py | 27 +++-------------------- Lib/test/test_htmlparser.py | 44 ++++--------------------------------- 2 files changed, 7 insertions(+), 64 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 336524bbe4b47e..a893c3846d6859 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -28,7 +28,6 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') -escapable_raw_text_close = re.compile('', re.I) commentclose = re.compile(r'--\s*>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; @@ -101,7 +100,6 @@ class HTMLParser(_markupbase.ParserBase): """ CDATA_CONTENT_ELEMENTS = ("script", "style") - ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -119,7 +117,6 @@ def reset(self): self.lasttag = '???' 
self.interesting = interesting_normal self.cdata_elem = None - self.escapable_raw_text_elem = None super().reset() def feed(self, data): @@ -141,14 +138,6 @@ def get_starttag_text(self): """Return full source of start tag: '<...>'.""" return self.__starttag_text - def set_escapable_raw_text_mode(self, elem): - self.escapable_raw_text_elem = elem.lower() - self.interesting = re.compile(r'' % self.escapable_raw_text_elem, re.I) - - def clear_escapable_raw_text_mode(self): - self.interesting = interesting_normal - self.escapable_raw_text_elem = None - def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() self.interesting = re.compile(r'' % self.cdata_elem, re.I) @@ -165,7 +154,7 @@ def goahead(self, end): i = 0 n = len(rawdata) while i < n: - if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: + if self.convert_charrefs and not self.cdata_elem: j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end @@ -184,13 +173,11 @@ def goahead(self, end): if match: j = match.start() else: - if self.escapable_raw_text_elem: - break if self.cdata_elem: break j = n if i < j: - if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: + if self.convert_charrefs and not self.cdata_elem: self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -367,8 +354,6 @@ def parse_starttag(self, i): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) - if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS: - self.set_escapable_raw_text_mode(tag) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) return endpos @@ -444,14 +429,8 @@ def parse_endtag(self, i): self.handle_data(rawdata[i:gtpos]) return gtpos - if self.escapable_raw_text_elem is not None: # title or textarea - if elem != self.escapable_raw_text_elem: - self.handle_data(rawdata[i:gtpos]) - return gtpos - self.handle_endtag(elem) self.clear_cdata_mode() - 
self.clear_escapable_raw_text_mode() return gtpos # Overridable -- finish processing of start+end tag: @@ -492,4 +471,4 @@ def handle_pi(self, data): pass def unknown_decl(self, data): - pass + pass \ No newline at end of file diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index a8bb683ef86510..41f35282e53c58 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -285,9 +285,7 @@ def test_cdata_content(self): #'foo = ', #'foo = ', ] - tags = ['script', 'style', 'textarea', 'title'] - # test the following 'casing' for each tag: script, SCRIPT, Script etc. - elements = [f(tag) for tag in tags for f in (str.lower, str.upper, str.capitalize)] + elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] for content in contents: for element in elements: element_lower = element.lower() @@ -319,34 +317,6 @@ def get_events(self): ("endtag", element_lower)], collector=Collector(convert_charrefs=False)) - def test_escapable_raw_text_content(self): - contents = [ - 'foo = "";', - 'foo = <\n/title> ', - '', - '\n//\n', - # valid character reference - 'A', - # ambiguous ampersand example - '¬aref', - 'foo = "";', - '', - # these two should be invalid according to the HTML 5 spec, - # section 8.1.2.2 - #'foo = ', - #'foo = ', - ] - elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] - for content in contents: - for element in elements: - element_lower = element.lower() - s = '<{element}>{content}'.format(element=element, - content=content) - self._run_check(s, [("starttag", element_lower, []), - ("data", content), - ("endtag", element_lower)]) - def test_EOF_in_cdata(self): content = """ ¬-an-entity-ref;

@@ -407,15 +377,9 @@ def test_convert_charrefs(self): ('starttag', 'script', []), ('data', text), ('endtag', 'script'), ('data', '"'), ('starttag', 'style', []), ('data', text), - ('endtag', 'style'), ('data', '"'), - ('starttag', 'title', []), ('data', text), - ('endtag', 'title'), ('data', '"'), - ('starttag', 'textarea', []), ('data', text), - ('endtag', 'textarea'), ('data', '"')] + ('endtag', 'style'), ('data', '"')] self._run_check('{1}{1}' - '{1}' - '{0}{1}' - '{1}'.format(text, charref), + '{1}'.format(text, charref), expected, collector=collector()) # check truncated charrefs at the end of the file html = '&quo &# &#x' @@ -922,4 +886,4 @@ def test_base_class_methods_called(self, super_reset_method, super_init_method): if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file From 43804bb6b348676f4c26694076840825f4e8f713 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Wed, 14 May 2025 20:15:40 +0200 Subject: [PATCH 7/8] test: add failing test --- Lib/test/test_htmlparser.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 41f35282e53c58..468322410d02ff 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -294,6 +294,29 @@ def test_cdata_content(self): self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)]) + def test_raw_text_content(self): + """See gh-issue #118350""" + content = """

tagshould be handled as text""" + elements = [ + "script", + "style", + "title", + "textarea", + "SCRIPT", + "STYLE", + "TITLE", + "TEXTAREA", + "Script", + "Style", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check(source, [ + ("starttag", element.lower(), []), + ("data", content) + ]) def test_cdata_with_closing_tags(self): # see issue #13358 From 70b8e5d6b7beea975c68ba5ef6c4138255b79779 Mon Sep 17 00:00:00 2001 From: Timon Viola <44016238+timonviola@users.noreply.github.com> Date: Wed, 14 May 2025 20:59:16 +0200 Subject: [PATCH 8/8] test: add charref test --- Lib/test/test_htmlparser.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 468322410d02ff..7347bbd9242e94 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -294,8 +294,9 @@ def test_cdata_content(self): self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)]) + def test_raw_text_content(self): - """See gh-issue #118350""" + # Tags should be treated as text in raw text and escapable raw text content. content = """

tagshould be handled as text""" elements = [ "script", "style", @@ -318,6 +319,33 @@ def test_raw_text_content(self): ("data", content) ]) + def test_escapable_raw_text_content(self): + # Charrefs should be escaped in escapable raw text content. + class Collector(EventCollector): + pass + + content = "Timon & Pumba" + expected = "Timon & Pumba" + elements = [ + "title", + "textarea", + "TITLE", + "TEXTAREA", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check( + source, [ + ("starttag", element.lower(), []), + ('data', 'Timon '), + ('entityref', 'amp'), + ('data', ' Pumba') + ], + collector=Collector(convert_charrefs=False), + ) + def test_cdata_with_closing_tags(self): # see issue #13358 # make sure that HTMLParser calls handle_data only once for each CDATA. @@ -496,7 +524,7 @@ def test_slashes_in_starttag(self): ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)]) ] self._run_check(html, expected) - #see issue #14538 + # see issue #14538 html = ('' '') expected = [ @@ -909,4 +937,4 @@ def test_base_class_methods_called(self, super_reset_method, super_init_method): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()