gh-94823: Improve coverage in tokenizer.c:valid_utf8 (GH-94856)
When loading a source file from disk, there is a separate UTF-8 validator distinct from the one in `unicode_decode_utf8`. This exercises that code path with the same set of invalid inputs as we use for testing the "other" UTF-8 decoder.
This commit is contained in:
parent
9d515997f9
commit
f215d7cac9
@ -224,6 +224,67 @@ class AbstractSourceEncodingTest:
|
|||||||
out = self.check_script_output(src, br"'\n\n\n'")
|
out = self.check_script_output(src, br"'\n\n\n'")
|
||||||
|
|
||||||
|
|
||||||
|
class UTF8ValidatorTest(unittest.TestCase):

    @unittest.skipIf(not sys.platform.startswith("linux"),
                     "Too slow to run on non-Linux platforms")
    def test_invalid_utf8(self):
        """Feed invalid UTF-8 byte sequences to the tokenizer's validator.

        This ports test_utf8_decode_invalid_sequences from test_unicode.py
        to exercise the separate UTF-8 validator in Parser/tokenizer.c used
        when reading source files.  That code path uses low-level C file
        I/O, so the only way to test it is to write actual files to disk
        and run them in a subprocess.
        """
        # Wrap each sample inside a string literal so the file would be an
        # otherwise valid Python source if its bytes were legal UTF-8.
        template = b'"%s"\n'

        def byte_range(lo, hi):
            # One single-byte bytes object per value in [lo, hi).
            return [bytes([v]) for v in range(lo, hi)]

        with tempfile.TemporaryDirectory() as tmpd:
            path = os.path.join(tmpd, 'test.py')

            def check(sample):
                # Write the candidate source file and assert that the
                # interpreter refuses to run it.
                with open(path, 'wb') as fp:
                    fp.write(template % sample)
                script_helper.assert_python_failure(path)

            # Continuation bytes: legal only inside a 2-, 3-, or 4-byte
            # sequence, never as a start byte.
            continuation_bytes = byte_range(0x80, 0xC0)
            # Start bytes of a 2-byte sequence that would encode code
            # points < 0x7F (overlong encodings).
            invalid_2B_seq_start_bytes = byte_range(0xC0, 0xC2)
            # Start bytes of a 4-byte sequence that would encode code
            # points > 0x10FFFF.
            invalid_4B_seq_start_bytes = byte_range(0xF5, 0xF8)
            invalid_start_bytes = (
                continuation_bytes + invalid_2B_seq_start_bytes +
                invalid_4B_seq_start_bytes + byte_range(0xF7, 0x100)
            )

            # Every invalid start byte must be rejected on its own.
            for start in invalid_start_bytes:
                check(start)

            # Overlong 2-byte sequences.
            for start in invalid_2B_seq_start_bytes:
                for cont in continuation_bytes:
                    check(start + cont)

            # 4-byte sequences beyond U+10FFFF (a sample of continuations).
            for start in invalid_4B_seq_start_bytes:
                for cont1 in continuation_bytes[:3]:
                    for cont3 in continuation_bytes[:3]:
                        check(start + cont1 + b'\x80' + cont3)

            # Overlong 3-byte sequences: E0 followed by 80..9F.
            for cont in byte_range(0x80, 0xA0):
                check(b'\xE0' + cont + b'\x80')
                check(b'\xE0' + cont + b'\xBF')
            # UTF-16 surrogates: ED followed by A0..BF.
            for cont in byte_range(0xA0, 0xC0):
                check(b'\xED' + cont + b'\x80')
                check(b'\xED' + cont + b'\xBF')
            # Overlong 4-byte sequences: F0 followed by 80..8F.
            for cont in byte_range(0x80, 0x90):
                check(b'\xF0' + cont + b'\x80\x80')
                check(b'\xF0' + cont + b'\xBF\xBF')
            # Code points above U+10FFFF: F4 followed by 90..BF.
            for cont in byte_range(0x90, 0xC0):
                check(b'\xF4' + cont + b'\x80\x80')
                check(b'\xF4' + cont + b'\xBF\xBF')
||||||
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
|
||||||
|
|
||||||
def check_script_output(self, src, expected):
|
def check_script_output(self, src, expected):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user