gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-05-10 16:31:43 +02:00 · 2025-05-10 16:31:43 +02:00 · 76c0b01bc4
commit 76c0b01bc4
parent e7741dd773
3 changed files with 68 additions and 19 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -278,7 +278,7 @@ class HTMLParser(_markupbase.ParserBase):
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
-        elif rawdata[i:i+3] == '<![':
+        elif rawdata[i:i+9] == '<![CDATA[':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
@ -295,7 +295,7 @@ class HTMLParser(_markupbase.ParserBase):
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
-                                                'parse_comment()')
+                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -566,12 +566,33 @@ text
        for html, expected in data:
            self._run_check(html, expected)
-    def test_broken_comments(self):
+    def test_EOF_in_comments_or_decls(self):
        data = [
            ('<!', [('data', '<!')]),
            ('<!-', [('data', '<!-')]),
            ('<!--', [('data', '<!--')]),
            ('<![', [('data', '<![')]),
            ('<![CDATA[', [('data', '<![CDATA[')]),
            ('<![CDATA[x', [('data', '<![CDATA[x')]),
            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
        ]
        for html, expected in data:
            self._run_check(html, expected)
    def test_bogus_comments(self):
        html = ('<! not really a comment >'
                '<! not a comment either -->'
                '<! -- close enough -->'
                '<!><!<-- this was an empty comment>'
-                '<!!! another bogus comment !!!>')
+                '<!!! another bogus comment !!!>'
                # see #32876
                '<![with square brackets]!>'
                '<![\nmultiline\nbogusness\n]!>'
                '<![more brackets]-[and a hyphen]!>'
                '<![cdata[should be uppercase]]>'
                '<![CDATA [whitespaces are not ignored]]>'
                '<![CDATA]]>'  # required '[' after CDATA
        )
        expected = [
            ('comment', ' not really a comment '),
            ('comment', ' not a comment either --'),
@ -579,39 +600,65 @@ text
            ('comment', ''),
            ('comment', '<-- this was an empty comment'),
            ('comment', '!! another bogus comment !!!'),
            ('comment', '[with square brackets]!'),
            ('comment', '[\nmultiline\nbogusness\n]!'),
            ('comment', '[more brackets]-[and a hyphen]!'),
            ('comment', '[cdata[should be uppercase]]'),
            ('comment', '[CDATA [whitespaces are not ignored]]'),
            ('comment', '[CDATA]]'),
        ]
        self._run_check(html, expected)
    def test_broken_condcoms(self):
        # these condcoms are missing the '--' after '<!' and before the '>'
        # and they are considered bogus comments according to
        # "8.2.4.42. Markup declaration open state"
        html = ('<![if !(IE)]>broken condcom<![endif]>'
                '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
                '<![if !IE 6]><img src="firefox.png" /><![endif]>'
                '<![if !ie 6]><b>foo</b><![endif]>'
                '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
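        # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
        # and "8.2.4.45 Markup declaration open state", comment tokens should
        # be emitted instead of 'unknown decl'.  The previous expectations
        # (the '-' lines below) relied on unknown_decl being called because
        # that provides more flexibility; the updated expectations (the '+'
        # lines below) expect 'comment' events, as the spec describes.
        # See also Lib/_markupbase.py:parse_declaration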
        expected = [
-            ('unknown decl', 'if !(IE)'),
+            ('comment', '[if !(IE)]'),
            ('data', 'broken condcom'),
-            ('unknown decl', 'endif'),
+            ('comment', '[endif]'),
-            ('unknown decl', 'if ! IE'),
+            ('comment', '[if ! IE]'),
            ('startendtag', 'link', [('href', 'favicon.tiff')]),
-            ('unknown decl', 'endif'),
+            ('comment', '[endif]'),
-            ('unknown decl', 'if !IE 6'),
+            ('comment', '[if !IE 6]'),
            ('startendtag', 'img', [('src', 'firefox.png')]),
-            ('unknown decl', 'endif'),
+            ('comment', '[endif]'),
-            ('unknown decl', 'if !ie 6'),
+            ('comment', '[if !ie 6]'),
            ('starttag', 'b', []),
            ('data', 'foo'),
            ('endtag', 'b'),
-            ('unknown decl', 'endif'),
+            ('comment', '[endif]'),
-            ('unknown decl', 'if (!IE)|(lt IE 9)'),
+            ('comment', '[if (!IE)|(lt IE 9)]'),
            ('startendtag', 'img', [('src', 'mammoth.bmp')]),
-            ('unknown decl', 'endif')
+            ('comment', '[endif]')
        ]
        self._run_check(html, expected)
    def test_cdata_declarations(self):
        # More tests should be added. See also "8.2.4.42. Markup
        # declaration open state", "8.2.4.69. CDATA section state",
        # and issue 32876
        #
        # Input: one uppercase '<![CDATA[' section terminated by ']]>'.
        # (The parentheses are only grouping; html is a plain str.)
        html = ('<![CDATA[just some plain text]]>')
        # The expected event list holds a single 'unknown decl' entry whose
        # payload keeps the 'CDATA[' prefix but has neither the leading
        # '<![' nor the trailing ']]>'.
        expected = [('unknown decl', 'CDATA[just some plain text')]
        self._run_check(html, expected)
    def test_cdata_declarations_multiline(self):
        # A CDATA section wrapped in <code>...</code> whose body contains
        # '<', '>', '&&', square brackets and tag-like text (<marquee>).
        # NOTE: the adjacent string literals are concatenated without any
        # newline characters, so the input is a single line despite the
        # test's name.
        html = ('<code><![CDATA['
                '    if (a < b && a > b) {'
                '        printf("[<marquee>How?</marquee>]");'
                '    }'
                ']]></code>')
        # Expected: the surrounding <code> start/end tags, and in between a
        # single 'unknown decl' entry carrying everything from 'CDATA[' up
        # to (but not including) the closing ']]>'.  The '<marquee>' markup
        # and the lone ']' inside the section appear verbatim in the
        # payload rather than as separate tag events.
        expected = [
            ('starttag', 'code', []),
            ('unknown decl',
             'CDATA[    if (a < b && a > b) {        '
             'printf("[<marquee>How?</marquee>]");    }'),
            ('endtag', 'code')
        ]
        self._run_check(html, expected)
--- a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
+++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
@ -0,0 +1,2 @@
 Fix handling of invalid markup declarations in
 :class:`html.parser.HTMLParser`.
		`@ -0,0 +1,2 @@`
							`Fix handling of invalid markup declarations in`
							:class:`html.parser.HTMLParser`.