bpo-32583: Fix possible crashing in builtin Unicode decoders (#5325)

When using customized decode error handlers, it is possible for builtin decoders to write out-of-bounds and then crash.
2018-01-31 20:48:05 +08:00 · 2018-01-31 20:48:05 +08:00 · 2c7fd46e11
commit 2c7fd46e11
parent 84521047e4
3 changed files with 74 additions and 2 deletions
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@ -1044,6 +1044,58 @@ class CodecCallbackTest(unittest.TestCase):
            for (encoding, data) in baddata:
                self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
    # issue32583
    def test_crashing_decode_handler(self):
        # better generating one more character to fill the extra space slot
        # so in debug build it can steadily fail
        def forward_shorter_than_end(exc):
            if isinstance(exc, UnicodeDecodeError):
                # size one character, 0 < forward < exc.end
                return ('\ufffd', exc.start+1)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error(
            "test.forward_shorter_than_end", forward_shorter_than_end)
        self.assertEqual(
            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
                'utf-16-le', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
        )
        self.assertEqual(
            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
                'utf-16-be', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
        )
        self.assertEqual(
            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
                'utf-32-le', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\u1111\x00'
        )
        self.assertEqual(
            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
                'utf-32-be', 'test.forward_shorter_than_end'),
            '\ufffd\ufffd\ufffd\u1111\x00'
        )
        def replace_with_long(exc):
            if isinstance(exc, UnicodeDecodeError):
                exc.object = b"\x00" * 8
                return ('\ufffd', exc.start)
            else:
                raise TypeError("don't know how to handle %r" % exc)
        codecs.register_error("test.replace_with_long", replace_with_long)
        self.assertEqual(
            b'\x00'.decode('utf-16', 'test.replace_with_long'),
            '\ufffd\x00\x00\x00\x00'
        )
        self.assertEqual(
            b'\x00'.decode('utf-32', 'test.replace_with_long'),
            '\ufffd\x00\x00'
        )
    def test_fake_error_class(self):
        handlers = [
            codecs.strict_errors,
--- a/Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
+++ b/Builtins/2018-01-26-21-20-21.bpo-32583.Fh3fau.rst
@ -0,0 +1,2 @@
 Fix a possible crash in built-in Unicode decoders caused by out-of-bounds
 writes when using custom decode error handlers.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4190,7 +4190,10 @@ unicode_decode_call_errorhandler_writer(
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
@ -4221,6 +4224,7 @@ unicode_decode_call_errorhandler_writer(
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
@ -4238,6 +4242,19 @@ unicode_decode_call_errorhandler_writer(
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
@ -4247,7 +4264,7 @@ unicode_decode_call_errorhandler_writer(
        goto onError;
    *endinpos = newpos;
-    *inptr = *input + newpos;
+    *inptr = new_inptr;
    /* we made it! */
    Py_DECREF(restuple);
@ -5572,7 +5589,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
 #endif
    /* Note: size will always be longer than the resulting Unicode
-       character count */
+       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
		`@ -0,0 +1,2 @@`
							`Fix possible crashing in builtin Unicode decoders caused by write`
							`out-of-bound errors when using customized decode error handlers.`