PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a
character is not in range [U+0000; U+10ffff].
This commit is contained in:
parent
bc9f0c68f5
commit
8faf8216e4
@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
|
||||||
|
#define MAX_UNICODE 0x10ffff
|
||||||
|
|
||||||
#ifdef Py_DEBUG
|
#ifdef Py_DEBUG
|
||||||
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
|
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
|
||||||
#else
|
#else
|
||||||
@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
assert(maxchar >= 0x10000);
|
assert(maxchar >= 0x10000);
|
||||||
/* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
|
assert(maxchar <= MAX_UNICODE);
|
||||||
return characters outside the range U+0000-U+10FFFF. */
|
|
||||||
/* assert(maxchar <= 0x10FFFF); */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
|
|||||||
Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
|
Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
|
||||||
{
|
{
|
||||||
const wchar_t *iter;
|
const wchar_t *iter;
|
||||||
|
Py_UCS4 ch;
|
||||||
|
|
||||||
assert(num_surrogates != NULL && maxchar != NULL);
|
assert(num_surrogates != NULL && maxchar != NULL);
|
||||||
*num_surrogates = 0;
|
*num_surrogates = 0;
|
||||||
*maxchar = 0;
|
*maxchar = 0;
|
||||||
|
|
||||||
for (iter = begin; iter < end; ) {
|
for (iter = begin; iter < end; ) {
|
||||||
if (*iter > *maxchar) {
|
|
||||||
*maxchar = *iter;
|
|
||||||
#if SIZEOF_WCHAR_T != 2
|
|
||||||
if (*maxchar >= 0x10000)
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
#if SIZEOF_WCHAR_T == 2
|
#if SIZEOF_WCHAR_T == 2
|
||||||
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
|
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
|
||||||
&& (iter+1) < end
|
&& (iter+1) < end
|
||||||
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
|
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
|
||||||
{
|
{
|
||||||
Py_UCS4 surrogate_val;
|
ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
|
||||||
surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
|
|
||||||
++(*num_surrogates);
|
++(*num_surrogates);
|
||||||
if (surrogate_val > *maxchar)
|
|
||||||
*maxchar = surrogate_val;
|
|
||||||
iter += 2;
|
iter += 2;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
iter++;
|
|
||||||
#else
|
|
||||||
iter++;
|
|
||||||
#endif
|
#endif
|
||||||
|
{
|
||||||
|
ch = *iter;
|
||||||
|
iter++;
|
||||||
|
}
|
||||||
|
if (ch > *maxchar) {
|
||||||
|
*maxchar = ch;
|
||||||
|
if (*maxchar > MAX_UNICODE) {
|
||||||
|
PyErr_Format(PyExc_ValueError,
|
||||||
|
"character U+%x is not in range [U+0000; U+10ffff]",
|
||||||
|
ch);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
|
|||||||
&maxchar, &num_surrogates) == -1)
|
&maxchar, &num_surrogates) == -1)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
unicode = PyUnicode_New(size - num_surrogates,
|
unicode = PyUnicode_New(size - num_surrogates, maxchar);
|
||||||
maxchar);
|
|
||||||
if (!unicode)
|
if (!unicode)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind)
|
|||||||
return 0x10000;
|
return 0x10000;
|
||||||
default:
|
default:
|
||||||
assert(0 && "invalid kind");
|
assert(0 && "invalid kind");
|
||||||
return 0x10ffff;
|
return MAX_UNICODE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2796,7 +2797,7 @@ PyObject *
|
|||||||
PyUnicode_FromOrdinal(int ordinal)
|
PyUnicode_FromOrdinal(int ordinal)
|
||||||
{
|
{
|
||||||
PyObject *v;
|
PyObject *v;
|
||||||
if (ordinal < 0 || ordinal > 0x10ffff) {
|
if (ordinal < 0 || ordinal > MAX_UNICODE) {
|
||||||
PyErr_SetString(PyExc_ValueError,
|
PyErr_SetString(PyExc_ValueError,
|
||||||
"chr() arg not in range(0x110000)");
|
"chr() arg not in range(0x110000)");
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
|
|||||||
four_bytes = PyUnicode_4BYTE_DATA(unicode);
|
four_bytes = PyUnicode_4BYTE_DATA(unicode);
|
||||||
for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
|
for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
|
||||||
if (*four_bytes > 0xFFFF) {
|
if (*four_bytes > 0xFFFF) {
|
||||||
assert(*four_bytes <= 0x10FFFF);
|
assert(*four_bytes <= MAX_UNICODE);
|
||||||
/* encode surrogate pair in this case */
|
/* encode surrogate pair in this case */
|
||||||
*w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
|
*w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
|
||||||
*w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
|
*w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
|
||||||
@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
|
|||||||
continue;
|
continue;
|
||||||
encode_char:
|
encode_char:
|
||||||
if (ch >= 0x10000) {
|
if (ch >= 0x10000) {
|
||||||
assert(ch <= 0x10FFFF);
|
assert(ch <= MAX_UNICODE);
|
||||||
|
|
||||||
/* code first surrogate */
|
/* code first surrogate */
|
||||||
base64bits += 16;
|
base64bits += 16;
|
||||||
@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
|
|||||||
}
|
}
|
||||||
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
|
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
|
||||||
|
|
||||||
WRITE_MAYBE_FAIL(i++, ch);
|
WRITE_MAYBE_FAIL(i++, ch);
|
||||||
break;
|
break;
|
||||||
@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
|||||||
}
|
}
|
||||||
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
|
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
|
||||||
|
|
||||||
#if SIZEOF_WCHAR_T == 4
|
#if SIZEOF_WCHAR_T == 4
|
||||||
*p++ = (wchar_t)ch;
|
*p++ = (wchar_t)ch;
|
||||||
@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
|
|||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
} else /* ch >= 0x10000 */ {
|
} else /* ch >= 0x10000 */ {
|
||||||
assert(ch <= 0x10FFFF);
|
assert(ch <= MAX_UNICODE);
|
||||||
/* Encode UCS4 Unicode ordinals */
|
/* Encode UCS4 Unicode ordinals */
|
||||||
*p++ = (char)(0xf0 | (ch >> 18));
|
*p++ = (char)(0xf0 | (ch >> 18));
|
||||||
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
||||||
@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||||||
break;
|
break;
|
||||||
store:
|
store:
|
||||||
/* when we get here, chr is a 32-bit unicode character */
|
/* when we get here, chr is a 32-bit unicode character */
|
||||||
if (chr <= 0x10ffff) {
|
if (chr <= MAX_UNICODE) {
|
||||||
WRITECHAR(chr);
|
WRITECHAR(chr);
|
||||||
} else {
|
} else {
|
||||||
endinpos = s-starts;
|
endinpos = s-starts;
|
||||||
@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
|||||||
|
|
||||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||||
else if (ch >= 0x10000) {
|
else if (ch >= 0x10000) {
|
||||||
assert(ch <= 0x10FFFF);
|
assert(ch <= MAX_UNICODE);
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
|
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
|
||||||
@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
|||||||
else
|
else
|
||||||
x += 10 + c - 'A';
|
x += 10 + c - 'A';
|
||||||
}
|
}
|
||||||
if (x <= 0x10ffff) {
|
if (x <= MAX_UNICODE) {
|
||||||
if (unicode_putchar(&v, &outpos, x) < 0)
|
if (unicode_putchar(&v, &outpos, x) < 0)
|
||||||
goto onError;
|
goto onError;
|
||||||
} else {
|
} else {
|
||||||
@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
|||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
|
||||||
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
||||||
if (ch >= 0x10000) {
|
if (ch >= 0x10000) {
|
||||||
assert(ch <= 0x10FFFF);
|
assert(ch <= MAX_UNICODE);
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
||||||
@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode,
|
|||||||
else if (ch < 1000000)
|
else if (ch < 1000000)
|
||||||
repsize += 2+6+1;
|
repsize += 2+6+1;
|
||||||
else {
|
else {
|
||||||
assert(ch <= 0x10FFFF);
|
assert(ch <= MAX_UNICODE);
|
||||||
repsize += 2+7+1;
|
repsize += 2+7+1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -9275,7 +9276,7 @@ fixup(PyObject *self,
|
|||||||
else if (maxchar_new <= 65535)
|
else if (maxchar_new <= 65535)
|
||||||
maxchar_new = 65535;
|
maxchar_new = 65535;
|
||||||
else
|
else
|
||||||
maxchar_new = 1114111; /* 0x10ffff */
|
maxchar_new = MAX_UNICODE;
|
||||||
|
|
||||||
if (!maxchar_new && PyUnicode_CheckExact(self)) {
|
if (!maxchar_new && PyUnicode_CheckExact(self)) {
|
||||||
/* fixfct should return TRUE if it modified the buffer. If
|
/* fixfct should return TRUE if it modified the buffer. If
|
||||||
@ -13059,7 +13060,7 @@ formatchar(PyObject *v)
|
|||||||
if (x == -1 && PyErr_Occurred())
|
if (x == -1 && PyErr_Occurred())
|
||||||
goto onError;
|
goto onError;
|
||||||
|
|
||||||
if (x < 0 || x > 0x10ffff) {
|
if (x < 0 || x > MAX_UNICODE) {
|
||||||
PyErr_SetString(PyExc_OverflowError,
|
PyErr_SetString(PyExc_OverflowError,
|
||||||
"%c arg not in range(0x110000)");
|
"%c arg not in range(0x110000)");
|
||||||
return (Py_UCS4) -1;
|
return (Py_UCS4) -1;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user