gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
e7741dd773
commit
76c0b01bc4
@ -278,7 +278,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||||||
if rawdata[i:i+4] == '<!--':
|
if rawdata[i:i+4] == '<!--':
|
||||||
# this case is actually already handled in goahead()
|
# this case is actually already handled in goahead()
|
||||||
return self.parse_comment(i)
|
return self.parse_comment(i)
|
||||||
elif rawdata[i:i+3] == '<![':
|
elif rawdata[i:i+9] == '<![CDATA[':
|
||||||
return self.parse_marked_section(i)
|
return self.parse_marked_section(i)
|
||||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||||
# find the closing >
|
# find the closing >
|
||||||
@ -295,7 +295,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||||||
def parse_bogus_comment(self, i, report=1):
|
def parse_bogus_comment(self, i, report=1):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
||||||
'parse_comment()')
|
'parse_bogus_comment()')
|
||||||
pos = rawdata.find('>', i+2)
|
pos = rawdata.find('>', i+2)
|
||||||
if pos == -1:
|
if pos == -1:
|
||||||
return -1
|
return -1
|
||||||
|
@ -566,12 +566,33 @@ text
|
|||||||
for html, expected in data:
|
for html, expected in data:
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_broken_comments(self):
|
def test_EOF_in_comments_or_decls(self):
|
||||||
|
data = [
|
||||||
|
('<!', [('data', '<!')]),
|
||||||
|
('<!-', [('data', '<!-')]),
|
||||||
|
('<!--', [('data', '<!--')]),
|
||||||
|
('<![', [('data', '<![')]),
|
||||||
|
('<![CDATA[', [('data', '<![CDATA[')]),
|
||||||
|
('<![CDATA[x', [('data', '<![CDATA[x')]),
|
||||||
|
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
|
||||||
|
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
|
||||||
|
]
|
||||||
|
for html, expected in data:
|
||||||
|
self._run_check(html, expected)
|
||||||
|
def test_bogus_comments(self):
|
||||||
html = ('<! not really a comment >'
|
html = ('<! not really a comment >'
|
||||||
'<! not a comment either -->'
|
'<! not a comment either -->'
|
||||||
'<! -- close enough -->'
|
'<! -- close enough -->'
|
||||||
'<!><!<-- this was an empty comment>'
|
'<!><!<-- this was an empty comment>'
|
||||||
'<!!! another bogus comment !!!>')
|
'<!!! another bogus comment !!!>'
|
||||||
|
# see #32876
|
||||||
|
'<![with square brackets]!>'
|
||||||
|
'<![\nmultiline\nbogusness\n]!>'
|
||||||
|
'<![more brackets]-[and a hyphen]!>'
|
||||||
|
'<![cdata[should be uppercase]]>'
|
||||||
|
'<![CDATA [whitespaces are not ignored]]>'
|
||||||
|
'<![CDATA]]>' # required '[' after CDATA
|
||||||
|
)
|
||||||
expected = [
|
expected = [
|
||||||
('comment', ' not really a comment '),
|
('comment', ' not really a comment '),
|
||||||
('comment', ' not a comment either --'),
|
('comment', ' not a comment either --'),
|
||||||
@ -579,39 +600,65 @@ text
|
|||||||
('comment', ''),
|
('comment', ''),
|
||||||
('comment', '<-- this was an empty comment'),
|
('comment', '<-- this was an empty comment'),
|
||||||
('comment', '!! another bogus comment !!!'),
|
('comment', '!! another bogus comment !!!'),
|
||||||
|
('comment', '[with square brackets]!'),
|
||||||
|
('comment', '[\nmultiline\nbogusness\n]!'),
|
||||||
|
('comment', '[more brackets]-[and a hyphen]!'),
|
||||||
|
('comment', '[cdata[should be uppercase]]'),
|
||||||
|
('comment', '[CDATA [whitespaces are not ignored]]'),
|
||||||
|
('comment', '[CDATA]]'),
|
||||||
]
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_broken_condcoms(self):
|
def test_broken_condcoms(self):
|
||||||
# these condcoms are missing the '--' after '<!' and before the '>'
|
# these condcoms are missing the '--' after '<!' and before the '>'
|
||||||
|
# and they are considered bogus comments according to
|
||||||
|
# "8.2.4.42. Markup declaration open state"
|
||||||
html = ('<![if !(IE)]>broken condcom<![endif]>'
|
html = ('<![if !(IE)]>broken condcom<![endif]>'
|
||||||
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
|
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
|
||||||
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
|
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
|
||||||
'<![if !ie 6]><b>foo</b><![endif]>'
|
'<![if !ie 6]><b>foo</b><![endif]>'
|
||||||
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
|
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
|
||||||
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
|
|
||||||
# and "8.2.4.45 Markup declaration open state", comment tokens should
|
|
||||||
# be emitted instead of 'unknown decl', but calling unknown_decl
|
|
||||||
# provides more flexibility.
|
|
||||||
# See also Lib/_markupbase.py:parse_declaration
|
|
||||||
expected = [
|
expected = [
|
||||||
('unknown decl', 'if !(IE)'),
|
('comment', '[if !(IE)]'),
|
||||||
('data', 'broken condcom'),
|
('data', 'broken condcom'),
|
||||||
('unknown decl', 'endif'),
|
('comment', '[endif]'),
|
||||||
('unknown decl', 'if ! IE'),
|
('comment', '[if ! IE]'),
|
||||||
('startendtag', 'link', [('href', 'favicon.tiff')]),
|
('startendtag', 'link', [('href', 'favicon.tiff')]),
|
||||||
('unknown decl', 'endif'),
|
('comment', '[endif]'),
|
||||||
('unknown decl', 'if !IE 6'),
|
('comment', '[if !IE 6]'),
|
||||||
('startendtag', 'img', [('src', 'firefox.png')]),
|
('startendtag', 'img', [('src', 'firefox.png')]),
|
||||||
('unknown decl', 'endif'),
|
('comment', '[endif]'),
|
||||||
('unknown decl', 'if !ie 6'),
|
('comment', '[if !ie 6]'),
|
||||||
('starttag', 'b', []),
|
('starttag', 'b', []),
|
||||||
('data', 'foo'),
|
('data', 'foo'),
|
||||||
('endtag', 'b'),
|
('endtag', 'b'),
|
||||||
('unknown decl', 'endif'),
|
('comment', '[endif]'),
|
||||||
('unknown decl', 'if (!IE)|(lt IE 9)'),
|
('comment', '[if (!IE)|(lt IE 9)]'),
|
||||||
('startendtag', 'img', [('src', 'mammoth.bmp')]),
|
('startendtag', 'img', [('src', 'mammoth.bmp')]),
|
||||||
('unknown decl', 'endif')
|
('comment', '[endif]')
|
||||||
|
]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
def test_cdata_declarations(self):
|
||||||
|
# More tests should be added. See also "8.2.4.42. Markup
|
||||||
|
# declaration open state", "8.2.4.69. CDATA section state",
|
||||||
|
# and issue 32876
|
||||||
|
html = ('<![CDATA[just some plain text]]>')
|
||||||
|
expected = [('unknown decl', 'CDATA[just some plain text')]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
def test_cdata_declarations_multiline(self):
|
||||||
|
html = ('<code><![CDATA['
|
||||||
|
' if (a < b && a > b) {'
|
||||||
|
' printf("[<marquee>How?</marquee>]");'
|
||||||
|
' }'
|
||||||
|
']]></code>')
|
||||||
|
expected = [
|
||||||
|
('starttag', 'code', []),
|
||||||
|
('unknown decl',
|
||||||
|
'CDATA[ if (a < b && a > b) { '
|
||||||
|
'printf("[<marquee>How?</marquee>]"); }'),
|
||||||
|
('endtag', 'code')
|
||||||
]
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
@ -0,0 +1,2 @@
|
|||||||
|
Fix handling of invalid markup declarations in
|
||||||
|
:class:`html.parser.HTMLParser`.
|
Loading…
x
Reference in New Issue
Block a user