gh-82052: Don't send partial UTF-8 sequences to the Windows API (GH-101103)
Don't send partial UTF-8 sequences to the Windows API
This commit is contained in:
parent
c5660ae96f
commit
f34176b77f
@ -0,0 +1 @@
|
|||||||
|
Fixed an issue where writing more than 32K of Unicode output to the console screen in one go could result in mojibake.
|
@ -954,7 +954,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
|
|||||||
{
|
{
|
||||||
BOOL res = TRUE;
|
BOOL res = TRUE;
|
||||||
wchar_t *wbuf;
|
wchar_t *wbuf;
|
||||||
DWORD len, wlen, n = 0;
|
DWORD len, wlen, orig_len, n = 0;
|
||||||
HANDLE handle;
|
HANDLE handle;
|
||||||
|
|
||||||
if (self->fd == -1)
|
if (self->fd == -1)
|
||||||
@ -984,6 +984,21 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
|
|||||||
have to reduce and recalculate. */
|
have to reduce and recalculate. */
|
||||||
while (wlen > 32766 / sizeof(wchar_t)) {
|
while (wlen > 32766 / sizeof(wchar_t)) {
|
||||||
len /= 2;
|
len /= 2;
|
||||||
|
orig_len = len;
|
||||||
|
/* Reduce the length until we hit the final byte of a UTF-8 sequence
|
||||||
|
* (top bit is unset). Fix for github issue 82052.
|
||||||
|
*/
|
||||||
|
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
|
||||||
|
--len;
|
||||||
|
/* If we hit a length of 0, something has gone wrong. This shouldn't
|
||||||
|
* be possible, as valid UTF-8 can have at most 3 non-final bytes
|
||||||
|
* before a final one, and our buffer is way longer than that.
|
||||||
|
* But to be on the safe side, if we hit this issue we just restore
|
||||||
|
* the original length and let the console API sort it out.
|
||||||
|
*/
|
||||||
|
if (len == 0) {
|
||||||
|
len = orig_len;
|
||||||
|
}
|
||||||
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
|
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
|
||||||
}
|
}
|
||||||
Py_END_ALLOW_THREADS
|
Py_END_ALLOW_THREADS
|
||||||
|
Loading…
x
Reference in New Issue
Block a user