Skip to content

gh-118350: Fix support of elements "textarea" and "title" in HTMLParser #135310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 22, 2025
Merged
Previous commit
Next commit
Refactoring.
  • Loading branch information
serhiy-storchaka committed Jul 14, 2025
commit 18c6ea80b2387103eff3ced2fb11e1baff8aba84
20 changes: 11 additions & 9 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ class HTMLParser(_markupbase.ParserBase):
containing respectively the named or numeric reference as the
argument.
"""
# For escapable raw text elements (textarea and title), CDATA mode is reused
CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")
CDATA_CONTENT_ELEMENTS = ("script", "style")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -145,7 +145,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._raw_escapable = False
self._escapable = True
super().reset()

def feed(self, data):
Expand All @@ -167,10 +167,10 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text

def set_cdata_mode(self, elem):
def set_cdata_mode(self, elem, escapable=False):
self.cdata_elem = elem.lower()
self._raw_escapable = self.cdata_elem in ("textarea", "title")
if self._raw_escapable and not self.convert_charrefs:
self._escapable = escapable
if escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
Expand All @@ -180,7 +180,7 @@ def set_cdata_mode(self, elem):
def clear_cdata_mode(self):
self.interesting = interesting_normal
self.cdata_elem = None
self._raw_escapable = False
self._escapable = True

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
Expand Down Expand Up @@ -213,7 +213,7 @@ def goahead(self, end):
break
j = n
if i < j:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand Down Expand Up @@ -315,7 +315,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
Expand Down Expand Up @@ -427,6 +427,8 @@ def parse_starttag(self, i):
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, True)
return endpos

# Internal -- check to see if we have a complete starttag; return end
Expand Down