gh-86155: Fix data loss after unclosed script or style tag in HTMLParser (GH-22658)

When calling .close(), the HTMLParser should flush all remaining content,
even when that content is inside an unclosed script or style tag.
This commit is contained in:
Waylan Limberg 2025-05-10 13:36:06 -04:00 committed by GitHub
parent 7dddb4e667
commit 53383e90e4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 1 deletions

View File

@ -260,7 +260,7 @@ class HTMLParser(_markupbase.ParserBase):
else: else:
assert 0, "interesting.search() lied" assert 0, "interesting.search() lied"
# end while # end while
if end and i < n and not self.cdata_elem: if end and i < n:
if self.convert_charrefs and not self.cdata_elem: if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:n])) self.handle_data(unescape(rawdata[i:n]))
else: else:

View File

@ -317,6 +317,16 @@ text
("endtag", element_lower)], ("endtag", element_lower)],
collector=Collector(convert_charrefs=False)) collector=Collector(convert_charrefs=False))
def test_EOF_in_cdata(self):
content = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""
s = f'<script>{content}'
self._run_check(s, [
("starttag", 'script', []),
("data", content)
])
def test_comments(self): def test_comments(self):
html = ("<!-- I'm a valid comment -->" html = ("<!-- I'm a valid comment -->"
'<!--me too!-->' '<!--me too!-->'

View File

@ -0,0 +1,2 @@
:meth:`html.parser.HTMLParser.close` no longer loses data when a
``<script>`` or ``<style>`` tag is not closed. Patch by Waylan Limberg.