This commit is contained in:
Brett Cannon 2013-04-09 17:03:27 -04:00
commit 2a4df127c7
4 changed files with 162 additions and 50 deletions

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL '\x9f' # 0xFF -> CONTROL
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL '\x9f' # 0xFF -> CONTROL
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -301,7 +301,6 @@ decoding_table = (
'\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE
'\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic) '\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic)
'\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
'\ufffe' ## Widen to UCS2 for optimization
) )
### Encoding table ### Encoding table

View File

@ -468,7 +468,9 @@ unicode_result_ready(PyObject *unicode)
} }
if (length == 1) { if (length == 1) {
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); void *data = PyUnicode_DATA(unicode);
int kind = PyUnicode_KIND(unicode);
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
if (ch < 256) { if (ch < 256) {
PyObject *latin1_char = unicode_latin1[ch]; PyObject *latin1_char = unicode_latin1[ch];
if (latin1_char != NULL) { if (latin1_char != NULL) {
@ -541,7 +543,6 @@ static OSVERSIONINFOEX winver;
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM_LINEBREAK(ch) \ #define BLOOM_LINEBREAK(ch) \
@ -551,21 +552,40 @@ static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Py_LOCAL_INLINE(BLOOM_MASK) Py_LOCAL_INLINE(BLOOM_MASK)
make_bloom_mask(int kind, void* ptr, Py_ssize_t len) make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
{ {
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
do { \
TYPE *data = (TYPE *)PTR; \
TYPE *end = data + LEN; \
Py_UCS4 ch; \
for (; data != end; data++) { \
ch = *data; \
MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
} \
break; \
} while (0)
/* calculate simple bloom-style bitmask for a given unicode string */ /* calculate simple bloom-style bitmask for a given unicode string */
BLOOM_MASK mask; BLOOM_MASK mask;
Py_ssize_t i;
mask = 0; mask = 0;
for (i = 0; i < len; i++) switch (kind) {
BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); case PyUnicode_1BYTE_KIND:
BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
return mask; break;
case PyUnicode_2BYTE_KIND:
BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
break;
case PyUnicode_4BYTE_KIND:
BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
break;
default:
assert(0);
} }
return mask;
#define BLOOM_MEMBER(mask, chr, str) \ #undef BLOOM_UPDATE
(BLOOM(mask, chr) \ }
&& (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
/* Compilation of templated routines */ /* Compilation of templated routines */
@ -1938,13 +1958,17 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
assert(size > 0); assert(size > 0);
if (size == 1) { if (size == 1) {
Py_UCS4 ch = u[0]; Py_UCS4 ch = u[0];
int kind;
void *data;
if (ch < 256) if (ch < 256)
return get_latin1_char((unsigned char)ch); return get_latin1_char((unsigned char)ch);
res = PyUnicode_New(1, ch); res = PyUnicode_New(1, ch);
if (res == NULL) if (res == NULL)
return NULL; return NULL;
PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); kind = PyUnicode_KIND(res);
data = PyUnicode_DATA(res);
PyUnicode_WRITE(kind, data, 0, ch);
assert(_PyUnicode_CheckConsistency(res, 1)); assert(_PyUnicode_CheckConsistency(res, 1));
return res; return res;
} }
@ -1974,13 +1998,17 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
assert(size > 0); assert(size > 0);
if (size == 1) { if (size == 1) {
Py_UCS4 ch = u[0]; Py_UCS4 ch = u[0];
int kind;
void *data;
if (ch < 256) if (ch < 256)
return get_latin1_char((unsigned char)ch); return get_latin1_char((unsigned char)ch);
res = PyUnicode_New(1, ch); res = PyUnicode_New(1, ch);
if (res == NULL) if (res == NULL)
return NULL; return NULL;
PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); kind = PyUnicode_KIND(res);
data = PyUnicode_DATA(res);
PyUnicode_WRITE(kind, data, 0, ch);
assert(_PyUnicode_CheckConsistency(res, 1)); assert(_PyUnicode_CheckConsistency(res, 1));
return res; return res;
} }
@ -2786,6 +2814,9 @@ PyObject *
PyUnicode_FromOrdinal(int ordinal) PyUnicode_FromOrdinal(int ordinal)
{ {
PyObject *v; PyObject *v;
void *data;
int kind;
if (ordinal < 0 || ordinal > MAX_UNICODE) { if (ordinal < 0 || ordinal > MAX_UNICODE) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"chr() arg not in range(0x110000)"); "chr() arg not in range(0x110000)");
@ -2798,7 +2829,9 @@ PyUnicode_FromOrdinal(int ordinal)
v = PyUnicode_New(1, ordinal); v = PyUnicode_New(1, ordinal);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
PyUnicode_WRITE(kind, data, 0, ordinal);
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(v, 1));
return v; return v;
} }
@ -3840,6 +3873,9 @@ PyUnicode_GetLength(PyObject *unicode)
Py_UCS4 Py_UCS4
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
{ {
void *data;
int kind;
if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
PyErr_BadArgument(); PyErr_BadArgument();
return (Py_UCS4)-1; return (Py_UCS4)-1;
@ -3848,7 +3884,9 @@ PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
PyErr_SetString(PyExc_IndexError, "string index out of range"); PyErr_SetString(PyExc_IndexError, "string index out of range");
return (Py_UCS4)-1; return (Py_UCS4)-1;
} }
return PyUnicode_READ_CHAR(unicode, index); data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
return PyUnicode_READ(kind, data, index);
} }
int int
@ -7281,6 +7319,7 @@ PyUnicode_DecodeCharmap(const char *s,
enum PyUnicode_Kind mapkind; enum PyUnicode_Kind mapkind;
void *mapdata; void *mapdata;
Py_UCS4 x; Py_UCS4 x;
unsigned char ch;
if (PyUnicode_READY(mapping) == -1) if (PyUnicode_READY(mapping) == -1)
return NULL; return NULL;
@ -7288,31 +7327,57 @@ PyUnicode_DecodeCharmap(const char *s,
maplen = PyUnicode_GET_LENGTH(mapping); maplen = PyUnicode_GET_LENGTH(mapping);
mapdata = PyUnicode_DATA(mapping); mapdata = PyUnicode_DATA(mapping);
mapkind = PyUnicode_KIND(mapping); mapkind = PyUnicode_KIND(mapping);
if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
/* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
* is disabled in encoding aliases, latin1 is preferred because
* its implementation is faster. */
Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Py_UCS4 maxchar = writer.maxchar;
assert (writer.kind == PyUnicode_1BYTE_KIND);
while (s < e) {
ch = *s;
x = mapdata_ucs1[ch];
if (x > maxchar) {
if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
goto onError;
maxchar = writer.maxchar;
outdata = (Py_UCS1 *)writer.data;
}
outdata[writer.pos] = x;
writer.pos++;
++s;
}
}
while (s < e) { while (s < e) {
unsigned char ch;
if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
enum PyUnicode_Kind outkind = writer.kind; enum PyUnicode_Kind outkind = writer.kind;
void *outdata = writer.data; Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
if (outkind == PyUnicode_1BYTE_KIND) { if (outkind == PyUnicode_1BYTE_KIND) {
Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
Py_UCS4 maxchar = writer.maxchar; Py_UCS4 maxchar = writer.maxchar;
while (s < e) { while (s < e) {
unsigned char ch = *s; ch = *s;
x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); x = mapdata_ucs2[ch];
if (x > maxchar) if (x > maxchar)
goto Error; goto Error;
PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x); outdata[writer.pos] = x;
writer.pos++; writer.pos++;
++s; ++s;
} }
break; break;
} }
else if (outkind == PyUnicode_2BYTE_KIND) { else if (outkind == PyUnicode_2BYTE_KIND) {
Py_UCS2 *outdata = (Py_UCS2 *)writer.data;
while (s < e) { while (s < e) {
unsigned char ch = *s; ch = *s;
x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); x = mapdata_ucs2[ch];
if (x == 0xFFFE) if (x == 0xFFFE)
goto Error; goto Error;
PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x); outdata[writer.pos] = x;
writer.pos++; writer.pos++;
++s; ++s;
} }
@ -7959,10 +8024,14 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
* -1=not initialized, 0=unknown, 1=strict, 2=replace, * -1=not initialized, 0=unknown, 1=strict, 2=replace,
* 3=ignore, 4=xmlcharrefreplace */ * 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1; int known_errorHandler = -1;
void *data;
int kind;
if (PyUnicode_READY(unicode) == -1) if (PyUnicode_READY(unicode) == -1)
return NULL; return NULL;
size = PyUnicode_GET_LENGTH(unicode); size = PyUnicode_GET_LENGTH(unicode);
data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
/* Default to Latin-1 */ /* Default to Latin-1 */
if (mapping == NULL) if (mapping == NULL)
@ -7977,7 +8046,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
return res; return res;
while (inpos<size) { while (inpos<size) {
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
/* try to encode it */ /* try to encode it */
charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
if (x==enc_EXCEPTION) /* error */ if (x==enc_EXCEPTION) /* error */
@ -9903,20 +9972,21 @@ replace(PyObject *self, PyObject *str1,
Py_UCS4 u1, u2; Py_UCS4 u1, u2;
int rkind; int rkind;
Py_ssize_t index, pos; Py_ssize_t index, pos;
char *src; char *src, *rbuf;
u1 = PyUnicode_READ_CHAR(str1, 0); u1 = PyUnicode_READ(kind1, buf1, 0);
pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
if (pos < 0) if (pos < 0)
goto nothing; goto nothing;
u2 = PyUnicode_READ_CHAR(str2, 0); u2 = PyUnicode_READ(kind2, buf2, 0);
u = PyUnicode_New(slen, maxchar); u = PyUnicode_New(slen, maxchar);
if (!u) if (!u)
goto error; goto error;
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
rkind = PyUnicode_KIND(u); rkind = PyUnicode_KIND(u);
rbuf = PyUnicode_DATA(u);
PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); PyUnicode_WRITE(rkind, rbuf, pos, u2);
index = 0; index = 0;
src = sbuf; src = sbuf;
while (--maxcount) while (--maxcount)
@ -9928,7 +9998,7 @@ replace(PyObject *self, PyObject *str1,
pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
if (pos < 0) if (pos < 0)
break; break;
PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); PyUnicode_WRITE(rkind, rbuf, index + pos, u2);
} }
} }
else { else {
@ -11570,6 +11640,7 @@ _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
int kind; int kind;
Py_ssize_t i, j, len; Py_ssize_t i, j, len;
BLOOM_MASK sepmask; BLOOM_MASK sepmask;
Py_ssize_t seplen;
if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
return NULL; return NULL;
@ -11577,24 +11648,35 @@ _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
kind = PyUnicode_KIND(self); kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self); data = PyUnicode_DATA(self);
len = PyUnicode_GET_LENGTH(self); len = PyUnicode_GET_LENGTH(self);
seplen = PyUnicode_GET_LENGTH(sepobj);
sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
PyUnicode_DATA(sepobj), PyUnicode_DATA(sepobj),
PyUnicode_GET_LENGTH(sepobj)); seplen);
i = 0; i = 0;
if (striptype != RIGHTSTRIP) { if (striptype != RIGHTSTRIP) {
while (i < len && while (i < len) {
BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (!BLOOM(sepmask, ch))
break;
if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
break;
i++; i++;
} }
} }
j = len; j = len;
if (striptype != LEFTSTRIP) { if (striptype != LEFTSTRIP) {
do {
j--; j--;
} while (j >= i && while (j >= i) {
BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); Py_UCS4 ch = PyUnicode_READ(kind, data, j);
if (!BLOOM(sepmask, ch))
break;
if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
break;
j--;
}
j++; j++;
} }
@ -11641,31 +11723,64 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
static PyObject * static PyObject *
do_strip(PyObject *self, int striptype) do_strip(PyObject *self, int striptype)
{ {
int kind;
void *data;
Py_ssize_t len, i, j; Py_ssize_t len, i, j;
if (PyUnicode_READY(self) == -1) if (PyUnicode_READY(self) == -1)
return NULL; return NULL;
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
len = PyUnicode_GET_LENGTH(self); len = PyUnicode_GET_LENGTH(self);
if (PyUnicode_IS_ASCII(self)) {
Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
i = 0; i = 0;
if (striptype != RIGHTSTRIP) { if (striptype != RIGHTSTRIP) {
while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { while (i < len) {
Py_UCS4 ch = data[i];
if (!_Py_ascii_whitespace[ch])
break;
i++; i++;
} }
} }
j = len; j = len;
if (striptype != LEFTSTRIP) { if (striptype != LEFTSTRIP) {
do {
j--; j--;
} while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); while (j >= i) {
Py_UCS4 ch = data[j];
if (!_Py_ascii_whitespace[ch])
break;
j--;
}
j++; j++;
} }
}
else {
int kind = PyUnicode_KIND(self);
void *data = PyUnicode_DATA(self);
i = 0;
if (striptype != RIGHTSTRIP) {
while (i < len) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (!Py_UNICODE_ISSPACE(ch))
break;
i++;
}
}
j = len;
if (striptype != LEFTSTRIP) {
j--;
while (j >= i) {
Py_UCS4 ch = PyUnicode_READ(kind, data, j);
if (!Py_UNICODE_ISSPACE(ch))
break;
j--;
}
j++;
}
}
return PyUnicode_Substring(self, i, j); return PyUnicode_Substring(self, i, j);
} }