From 420af549bfc2c876333b0d16fc8cee595df51440 Mon Sep 17 00:00:00 2001
From: Timon Viola <44016238+timonviola@users.noreply.github.com>
Date: Sun, 14 Jul 2024 15:58:31 +0200
Subject: [PATCH 1/8] fix: add escapable raw text mode to html parser
---
Lib/html/parser.py | 25 +++++++++++++++--
Lib/test/test_htmlparser.py | 54 ++++++++++++++++++++++++++++++++++++-
2 files changed, 76 insertions(+), 3 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 13c95c34e505c8..df0c365e74126b 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -26,6 +26,7 @@
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
+# Closing tag of an escapable raw text element; anchored on '</' so that a
+# bare "title>" or "textarea>" inside text data is not taken for a close tag.
+escapable_raw_text_close = re.compile('</(title|textarea)>', re.I)
commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase):
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
+ ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea")
def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
@@ -99,6 +101,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
+ self.escapable_raw_text_elem = None
super().reset()
def feed(self, data):
@@ -120,6 +123,14 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
+    def set_escapable_raw_text_mode(self, elem):
+        """Enter escapable raw text mode for *elem* (e.g. title, textarea).
+
+        Stores the lower-cased element name in ``escapable_raw_text_elem``
+        and replaces ``self.interesting`` with a case-insensitive pattern
+        for that element's closing tag.
+        """
+        self.escapable_raw_text_elem = elem.lower()
+        # The pattern must be anchored on '</': without it, ordinary text
+        # such as "my title> ..." would match as if it were a closing tag.
+        self.interesting = re.compile(
+            r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I)
+
+    def clear_escapable_raw_text_mode(self):
+        """Leave escapable raw text mode and restore the normal pattern."""
+        self.interesting = interesting_normal
+        self.escapable_raw_text_elem = None
+
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'\s*%s\s*>' % self.cdata_elem, re.I)
@@ -136,7 +147,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
@@ -155,11 +166,13 @@ def goahead(self, end):
if match:
j = match.start()
else:
+ if self.escapable_raw_text_elem:
+ break
if self.cdata_elem:
break
j = n
if i < j:
- if self.convert_charrefs and not self.cdata_elem:
+ if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
@@ -336,6 +349,8 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
+ if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS:
+ self.set_escapable_raw_text_mode(tag)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
@@ -411,8 +426,14 @@ def parse_endtag(self, i):
self.handle_data(rawdata[i:gtpos])
return gtpos
+ if self.escapable_raw_text_elem is not None: # title or textarea
+ if elem != self.escapable_raw_text_elem:
+ self.handle_data(rawdata[i:gtpos])
+ return gtpos
+
self.handle_endtag(elem)
self.clear_cdata_mode()
+ self.clear_escapable_raw_text_mode()
return gtpos
# Overridable -- finish processing of start+end tag:
', + 'foo = "";', + 'foo = "";', + 'foo = <\n/title> ', + '', + '\n//\n', + 'foo = "";', + '', + # these two should be invalid according to the HTML 5 spec, + # section 8.1.2.2 + #'foo = \nscript>', + #'foo = script>', + ] + elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] + for content in contents: + for element in elements: + element_lower = element.lower() + s = '<{element}>{content}{element}>'.format(element=element, + content=content) + self._run_check(s, [("starttag", element_lower, []), + ("data", content), + ("endtag", element_lower)]) + + def test_escapable_raw_text_with_closing_tags(self): + # see issue #13358 + # make sure that HTMLParser calls handle_data only once for each CDATA. + # The normal event collector normalizes the events in get_events, + # so we override it to return the original list of events. + class Collector(EventCollector): + def get_events(self): + return self.events + + content = """ ¬-an-entity-ref; +
+ ''""" + for element in [' script', 'script ', ' script ', + '\nscript', 'script\n', '\nscript\n']: + element_lower = element.lower().strip() + s = ''""" - for element in [' script', 'script ', ' script ', - '\nscript', 'script\n', '\nscript\n']: - element_lower = element.lower().strip() - s = '{1}' - '{1}'.format(text, charref), + '{1}' + '
@@ -407,15 +377,9 @@ def test_convert_charrefs(self): ('starttag', 'script', []), ('data', text), ('endtag', 'script'), ('data', '"'), ('starttag', 'style', []), ('data', text), - ('endtag', 'style'), ('data', '"'), - ('starttag', 'title', []), ('data', text), - ('endtag', 'title'), ('data', '"'), - ('starttag', 'textarea', []), ('data', text), - ('endtag', 'textarea'), ('data', '"')] + ('endtag', 'style'), ('data', '"')] self._run_check('{1}{1}' - '{1}' - '