Make the unicode-escape and the UTF-16 codecs handle surrogates
correctly, and thus make them roundtrip-safe. Some minor cleanups of the code. Added tests for roundtrip safety.
This commit is contained in:
parent
0d42e0c54a
commit
6c6bfb7c70
@ -445,11 +445,19 @@ verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
|
|||||||
verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
|
verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
|
||||||
verify(u'hello'.encode('latin-1') == 'hello')
|
verify(u'hello'.encode('latin-1') == 'hello')
|
||||||
|
|
||||||
|
# Roundtrip safety for BMP (just the first 1024 chars)
|
||||||
u = u''.join(map(unichr, range(1024)))
|
u = u''.join(map(unichr, range(1024)))
|
||||||
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
|
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
|
||||||
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
||||||
verify(unicode(u.encode(encoding),encoding) == u)
|
verify(unicode(u.encode(encoding),encoding) == u)
|
||||||
|
|
||||||
|
# Roundtrip safety for non-BMP (just a few chars)
|
||||||
|
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
|
||||||
|
for encoding in ('utf-8',
|
||||||
|
'utf-16', 'utf-16-le', 'utf-16-be',
|
||||||
|
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
|
||||||
|
verify(unicode(u.encode(encoding),encoding) == u)
|
||||||
|
|
||||||
u = u''.join(map(unichr, range(256)))
|
u = u''.join(map(unichr, range(256)))
|
||||||
for encoding in (
|
for encoding in (
|
||||||
'latin-1',
|
'latin-1',
|
||||||
|
@ -104,7 +104,7 @@ static PyUnicodeObject *unicode_latin1[256];
|
|||||||
static char unicode_default_encoding[100];
|
static char unicode_default_encoding[100];
|
||||||
|
|
||||||
Py_UNICODE
|
Py_UNICODE
|
||||||
PyUnicode_GetMax()
|
PyUnicode_GetMax(void)
|
||||||
{
|
{
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
return 0x10FFFF;
|
return 0x10FFFF;
|
||||||
@ -1081,17 +1081,12 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
|
|||||||
#endif
|
#endif
|
||||||
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
||||||
#ifndef Py_UNICODE_WIDE
|
#ifndef Py_UNICODE_WIDE
|
||||||
/* This is valid data (a UTF-16 surrogate pair), but
|
*p++ = ch;
|
||||||
we are not able to store this information since our
|
*p++ = ch2;
|
||||||
Py_UNICODE type only has 16 bits... this might
|
|
||||||
change someday, even though it's unlikely. */
|
|
||||||
errmsg = "code pairs are not supported";
|
|
||||||
goto utf16Error;
|
|
||||||
#else
|
#else
|
||||||
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
|
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
|
||||||
continue;
|
|
||||||
#endif
|
#endif
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
errmsg = "illegal UTF-16 surrogate";
|
errmsg = "illegal UTF-16 surrogate";
|
||||||
@ -1325,7 +1320,8 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||||||
/* UCS-2 character */
|
/* UCS-2 character */
|
||||||
*p++ = (Py_UNICODE) chr;
|
*p++ = (Py_UNICODE) chr;
|
||||||
else if (chr <= 0x10ffff) {
|
else if (chr <= 0x10ffff) {
|
||||||
/* UCS-4 character. Either store directly, or as surrogate pair. */
|
/* UCS-4 character. Either store directly, or as
|
||||||
|
surrogate pair. */
|
||||||
#ifdef Py_UNICODE_WIDE
|
#ifdef Py_UNICODE_WIDE
|
||||||
*p++ = chr;
|
*p++ = chr;
|
||||||
#else
|
#else
|
||||||
@ -1446,24 +1442,50 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
|
|||||||
else if (ch >= 0x10000) {
|
else if (ch >= 0x10000) {
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = hexdigit[(ch >> 28) & 0xf];
|
*p++ = hexdigit[(ch >> 28) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 24) & 0xf];
|
*p++ = hexdigit[(ch >> 24) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 20) & 0xf];
|
*p++ = hexdigit[(ch >> 20) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 16) & 0xf];
|
*p++ = hexdigit[(ch >> 16) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 12) & 0xf];
|
*p++ = hexdigit[(ch >> 12) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 8) & 0xf];
|
*p++ = hexdigit[(ch >> 8) & 0x0000000F];
|
||||||
*p++ = hexdigit[(ch >> 4) & 0xf];
|
*p++ = hexdigit[(ch >> 4) & 0x0000000F];
|
||||||
*p++ = hexdigit[ch & 15];
|
*p++ = hexdigit[ch & 15];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
|
||||||
|
else if (ch >= 0xD800 && ch < 0xDC00) {
|
||||||
|
Py_UNICODE ch2;
|
||||||
|
Py_UCS4 ucs;
|
||||||
|
|
||||||
|
ch2 = *s++;
|
||||||
|
size--;
|
||||||
|
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
||||||
|
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'U';
|
||||||
|
*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
|
||||||
|
*p++ = hexdigit[ucs & 0x0000000F];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* Fall through: isolated surrogates are copied as-is */
|
||||||
|
s--;
|
||||||
|
size++;
|
||||||
|
}
|
||||||
|
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
else if (ch >= 256) {
|
if (ch >= 256) {
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'u';
|
*p++ = 'u';
|
||||||
*p++ = hexdigit[(ch >> 12) & 0xf];
|
*p++ = hexdigit[(ch >> 12) & 0x000F];
|
||||||
*p++ = hexdigit[(ch >> 8) & 0xf];
|
*p++ = hexdigit[(ch >> 8) & 0x000F];
|
||||||
*p++ = hexdigit[(ch >> 4) & 0xf];
|
*p++ = hexdigit[(ch >> 4) & 0x000F];
|
||||||
*p++ = hexdigit[ch & 15];
|
*p++ = hexdigit[ch & 0x000F];
|
||||||
}
|
}
|
||||||
/* Map special whitespace to '\t', \n', '\r' */
|
/* Map special whitespace to '\t', \n', '\r' */
|
||||||
else if (ch == '\t') {
|
else if (ch == '\t') {
|
||||||
@ -1482,8 +1504,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
|
|||||||
else if (ch < ' ' || ch >= 128) {
|
else if (ch < ' ' || ch >= 128) {
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'x';
|
*p++ = 'x';
|
||||||
*p++ = hexdigit[(ch >> 4) & 0xf];
|
*p++ = hexdigit[(ch >> 4) & 0x000F];
|
||||||
*p++ = hexdigit[ch & 15];
|
*p++ = hexdigit[ch & 0x000F];
|
||||||
}
|
}
|
||||||
/* Copy everything else as-is */
|
/* Copy everything else as-is */
|
||||||
else
|
else
|
||||||
|
Loading…
x
Reference in New Issue
Block a user