Skip to content

Commit 8de88e0

Browse files
miss-islington, timonviola, serhiy-storchaka, and ambv
authored
[3.13] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-136985)
(cherry picked from commit 4d02f31) Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Łukasz Langa <lukasz@langa.pl>
1 parent 4999cdb commit 8de88e0

File tree

3 files changed

+113
-5
lines changed

3 files changed

+113
-5
lines changed

Lib/html/parser.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class HTMLParser(_markupbase.ParserBase):
128128
"""
129129

130130
CDATA_CONTENT_ELEMENTS = ("script", "style")
131+
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
131132

132133
def __init__(self, *, convert_charrefs=True):
133134
"""Initialize and reset this instance.
@@ -145,6 +146,7 @@ def reset(self):
145146
self.lasttag = '???'
146147
self.interesting = interesting_normal
147148
self.cdata_elem = None
149+
self._escapable = True
148150
super().reset()
149151

150152
def feed(self, data):
@@ -166,14 +168,20 @@ def get_starttag_text(self):
166168
"""Return full source of start tag: '<...>'."""
167169
return self.__starttag_text
168170

169-
def set_cdata_mode(self, elem):
171+
def set_cdata_mode(self, elem, *, escapable=False):
170172
self.cdata_elem = elem.lower()
171-
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
172-
re.IGNORECASE|re.ASCII)
173+
self._escapable = escapable
174+
if escapable and not self.convert_charrefs:
175+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
176+
re.IGNORECASE|re.ASCII)
177+
else:
178+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
179+
re.IGNORECASE|re.ASCII)
173180

174181
def clear_cdata_mode(self):
175182
self.interesting = interesting_normal
176183
self.cdata_elem = None
184+
self._escapable = True
177185

178186
# Internal -- handle data as far as reasonable. May leave state
179187
# and data to be processed by a subsequent call. If 'end' is
@@ -206,7 +214,7 @@ def goahead(self, end):
206214
break
207215
j = n
208216
if i < j:
209-
if self.convert_charrefs and not self.cdata_elem:
217+
if self.convert_charrefs and self._escapable:
210218
self.handle_data(unescape(rawdata[i:j]))
211219
else:
212220
self.handle_data(rawdata[i:j])
@@ -308,7 +316,7 @@ def goahead(self, end):
308316
assert 0, "interesting.search() lied"
309317
# end while
310318
if end and i < n:
311-
if self.convert_charrefs and not self.cdata_elem:
319+
if self.convert_charrefs and self._escapable:
312320
self.handle_data(unescape(rawdata[i:n]))
313321
else:
314322
self.handle_data(rawdata[i:n])
@@ -420,6 +428,8 @@ def parse_starttag(self, i):
420428
self.handle_starttag(tag, attrs)
421429
if tag in self.CDATA_CONTENT_ELEMENTS:
422430
self.set_cdata_mode(tag)
431+
elif tag in self.RCDATA_CONTENT_ELEMENTS:
432+
self.set_cdata_mode(tag, escapable=True)
423433
return endpos
424434

425435
# Internal -- check to see if we have a complete starttag; return end

Lib/test/test_htmlparser.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,49 @@ def test_style_content(self, content):
317317
("data", content),
318318
("endtag", "style")])
319319

320+
@support.subTests('content', [
321+
'<!-- not a comment -->',
322+
"<not a='start tag'>",
323+
'<![CDATA[not a cdata]]>',
324+
'<!not a bogus comment>',
325+
'</not a bogus comment>',
326+
'\u2603',
327+
'< /title>',
328+
'</ title>',
329+
'</titled>',
330+
'</title\v>',
331+
'</title\xa0>',
332+
'</tıtle>',
333+
])
334+
def test_title_content(self, content):
335+
source = f"<title>{content}</title>"
336+
self._run_check(source, [
337+
("starttag", "title", []),
338+
("data", content),
339+
("endtag", "title"),
340+
])
341+
342+
@support.subTests('content', [
343+
'<!-- not a comment -->',
344+
"<not a='start tag'>",
345+
'<![CDATA[not a cdata]]>',
346+
'<!not a bogus comment>',
347+
'</not a bogus comment>',
348+
'\u2603',
349+
'< /textarea>',
350+
'</ textarea>',
351+
'</textareable>',
352+
'</textarea\v>',
353+
'</textarea\xa0>',
354+
])
355+
def test_textarea_content(self, content):
356+
source = f"<textarea>{content}</textarea>"
357+
self._run_check(source, [
358+
("starttag", "textarea", []),
359+
("data", content),
360+
("endtag", "textarea"),
361+
])
362+
320363
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
321364
'script/', 'script foo=bar', 'script foo=">"'])
322365
def test_script_closing_tag(self, endtag):
@@ -346,6 +389,38 @@ def test_style_closing_tag(self, endtag):
346389
("endtag", "style")],
347390
collector=EventCollectorNoNormalize(convert_charrefs=False))
348391

392+
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
393+
'title/', 'title foo=bar', 'title foo=">"'])
394+
def test_title_closing_tag(self, endtag):
395+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
396+
s = f'<TitLe>{content}</{endtag}>'
397+
self._run_check(s, [("starttag", "title", []),
398+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
399+
("endtag", "title")],
400+
collector=EventCollectorNoNormalize(convert_charrefs=True))
401+
self._run_check(s, [("starttag", "title", []),
402+
('data', '<!-- not a comment --><i>Egg '),
403+
('entityref', 'amp'),
404+
('data', ' Spam</i>'),
405+
("endtag", "title")],
406+
collector=EventCollectorNoNormalize(convert_charrefs=False))
407+
408+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
409+
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
410+
def test_textarea_closing_tag(self, endtag):
411+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
412+
s = f'<TexTarEa>{content}</{endtag}>'
413+
self._run_check(s, [("starttag", "textarea", []),
414+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
415+
("endtag", "textarea")],
416+
collector=EventCollectorNoNormalize(convert_charrefs=True))
417+
self._run_check(s, [("starttag", "textarea", []),
418+
('data', '<!-- not a comment --><i>Egg '),
419+
('entityref', 'amp'),
420+
('data', ' Spam</i>'),
421+
("endtag", "textarea")],
422+
collector=EventCollectorNoNormalize(convert_charrefs=False))
423+
349424
@support.subTests('tail,end', [
350425
('', False),
351426
('<', False),
@@ -363,6 +438,27 @@ def test_eof_in_script(self, tail, end):
363438
("data", content if end else content + tail)],
364439
collector=EventCollectorNoNormalize(convert_charrefs=False))
365440

441+
@support.subTests('tail,end', [
442+
('', False),
443+
('<', False),
444+
('</', False),
445+
('</t', False),
446+
('</title', False),
447+
('</title ', True),
448+
('</title foo=bar', True),
449+
('</title foo=">', True),
450+
])
451+
def test_eof_in_title(self, tail, end):
452+
s = f'<TitLe>Egg &amp; Spam{tail}'
453+
self._run_check(s, [("starttag", "title", []),
454+
("data", "Egg & Spam" + ('' if end else tail))],
455+
collector=EventCollectorNoNormalize(convert_charrefs=True))
456+
self._run_check(s, [("starttag", "title", []),
457+
('data', 'Egg '),
458+
('entityref', 'amp'),
459+
('data', ' Spam' + ('' if end else tail))],
460+
collector=EventCollectorNoNormalize(convert_charrefs=False))
461+
366462
def test_comments(self):
367463
html = ("<!-- I'm a valid comment -->"
368464
'<!--me too!-->'
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix support of escapable raw text mode (elements "textarea" and "title")
2+
in :class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)