Change the treatment of positions returned by PEP 293
error handlers in the Unicode codecs: Negative positions are treated as being relative to the end of the input and out of bounds positions result in an IndexError. Also update the PEP and include an explanation of this in the documentation for codecs.register_error. Fixes a small bug in iconv_codecs: if the position from the callback is negative *add* it to the size instead of subtracting it. From SF patch #677429.
This commit is contained in:
parent
f7f4517fae
commit
2e0b18af30
@ -103,11 +103,22 @@ Raises a \exception{LookupError} in case the encoding cannot be found.
|
|||||||
Register the error handling function \var{error_handler} under the
|
Register the error handling function \var{error_handler} under the
|
||||||
name \var{name}. \var{error_handler} will be called during encoding
|
name \var{name}. \var{error_handler} will be called during encoding
|
||||||
and decoding in case of an error, when \var{name} is specified as the
|
and decoding in case of an error, when \var{name} is specified as the
|
||||||
errors parameter. \var{error_handler} will be called with an
|
errors parameter.
|
||||||
\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or
|
|
||||||
\exception{UnicodeTranslateError} instance and must return a tuple
|
For encoding \var{error_handler} will be called with a
|
||||||
with a replacement for the unencodable/undecodable part of the input
|
\exception{UnicodeEncodeError} instance, which contains information about
|
||||||
and a position where encoding/decoding should continue.
|
the location of the error. The error handler must either raise this or
|
||||||
|
a different exception or return a tuple with a replacement for the
|
||||||
|
unencodable part of the input and a position where encoding should
|
||||||
|
continue. The encoder will encode the replacement and continue encoding
|
||||||
|
the original input at the specified position. Negative position values
|
||||||
|
will be treated as being relative to the end of the input string. If the
|
||||||
|
resulting position is out of bounds, an \exception{IndexError} will be raised.
|
||||||
|
|
||||||
|
Decoding and translating work similarly, except that \exception{UnicodeDecodeError}
|
||||||
|
or \exception{UnicodeTranslateError} will be passed to the handler and
|
||||||
|
that the replacement from the error handler will be put into the output
|
||||||
|
directly.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
\begin{funcdesc}{lookup_error}{name}
|
\begin{funcdesc}{lookup_error}{name}
|
||||||
|
@ -572,7 +572,7 @@ class C:
|
|||||||
\var{classinfo} argument, or of a (direct or indirect) subclass
|
\var{classinfo} argument, or of a (direct or indirect) subclass
|
||||||
thereof. Also return true if \var{classinfo} is a type object and
|
thereof. Also return true if \var{classinfo} is a type object and
|
||||||
\var{object} is an object of that type. If \var{object} is not a
|
\var{object} is an object of that type. If \var{object} is not a
|
||||||
class instance or a object of the given type, the function always
|
class instance or an object of the given type, the function always
|
||||||
returns false. If \var{classinfo} is neither a class object nor a
|
returns false. If \var{classinfo} is neither a class object nor a
|
||||||
type object, it may be a tuple of class or type objects, or may
|
type object, it may be a tuple of class or type objects, or may
|
||||||
recursively contain other such tuples (other sequence types are not
|
recursively contain other such tuples (other sequence types are not
|
||||||
|
@ -1,6 +1,23 @@
|
|||||||
import test.test_support, unittest
|
import test.test_support, unittest
|
||||||
import sys, codecs, htmlentitydefs, unicodedata
|
import sys, codecs, htmlentitydefs, unicodedata
|
||||||
|
|
||||||
|
class PosReturn:
|
||||||
|
# this can be used for configurable callbacks
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.pos = 0
|
||||||
|
|
||||||
|
def handle(self, exc):
|
||||||
|
oldpos = self.pos
|
||||||
|
realpos = oldpos
|
||||||
|
if realpos<0:
|
||||||
|
realpos = len(exc.object) + realpos
|
||||||
|
# if we don't advance this time, terminate on the next call
|
||||||
|
# otherwise we'd get an endless loop
|
||||||
|
if realpos <= exc.start:
|
||||||
|
self.pos = len(exc.object)
|
||||||
|
return (u"<?>", oldpos)
|
||||||
|
|
||||||
class CodecCallbackTest(unittest.TestCase):
|
class CodecCallbackTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_xmlcharrefreplace(self):
|
def test_xmlcharrefreplace(self):
|
||||||
@ -543,18 +560,36 @@ class CodecCallbackTest(unittest.TestCase):
|
|||||||
codecs.register_error("test.baddecodereturn2", baddecodereturn2)
|
codecs.register_error("test.baddecodereturn2", baddecodereturn2)
|
||||||
self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
|
self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2")
|
||||||
|
|
||||||
pos = [-42]
|
handler = PosReturn()
|
||||||
def negposreturn(exc):
|
codecs.register_error("test.posreturn", handler.handle)
|
||||||
pos[0] += 1 # use list to work around scoping problem
|
|
||||||
return (u"?", pos[0])
|
|
||||||
codecs.register_error("test.negposreturn", negposreturn)
|
|
||||||
"\xff".decode("ascii", "test.negposreturn")
|
|
||||||
|
|
||||||
def hugeposreturn(exc):
|
# Valid negative position
|
||||||
return (u"?", 424242)
|
handler.pos = -1
|
||||||
codecs.register_error("test.hugeposreturn", hugeposreturn)
|
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
|
||||||
"\xff".decode("ascii", "test.hugeposreturn")
|
|
||||||
"\\uyyyy".decode("raw-unicode-escape", "test.hugeposreturn")
|
# Valid negative position
|
||||||
|
handler.pos = -2
|
||||||
|
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?><?>")
|
||||||
|
|
||||||
|
# Negative position out of bounds
|
||||||
|
handler.pos = -3
|
||||||
|
self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
|
||||||
|
|
||||||
|
# Valid positive position
|
||||||
|
handler.pos = 1
|
||||||
|
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0")
|
||||||
|
|
||||||
|
# Largest valid positive position (one beyond end of input)
|
||||||
|
handler.pos = 2
|
||||||
|
self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>")
|
||||||
|
|
||||||
|
# Invalid positive position
|
||||||
|
handler.pos = 3
|
||||||
|
self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn")
|
||||||
|
|
||||||
|
# Restart at the "0"
|
||||||
|
handler.pos = 6
|
||||||
|
self.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0")
|
||||||
|
|
||||||
class D(dict):
|
class D(dict):
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
@ -579,22 +614,39 @@ class CodecCallbackTest(unittest.TestCase):
|
|||||||
codecs.register_error("test.badencodereturn2", badencodereturn2)
|
codecs.register_error("test.badencodereturn2", badencodereturn2)
|
||||||
self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
|
self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2")
|
||||||
|
|
||||||
pos = [-42]
|
handler = PosReturn()
|
||||||
def negposreturn(exc):
|
codecs.register_error("test.posreturn", handler.handle)
|
||||||
pos[0] += 1 # use list to work around scoping problem
|
|
||||||
return (u"?", pos[0])
|
|
||||||
codecs.register_error("test.negposreturn", negposreturn)
|
|
||||||
u"\xff".encode("ascii", "test.negposreturn")
|
|
||||||
|
|
||||||
def hugeposreturn(exc):
|
# Valid negative position
|
||||||
return (u"?", 424242)
|
handler.pos = -1
|
||||||
codecs.register_error("test.hugeposreturn", hugeposreturn)
|
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
|
||||||
u"\xff".encode("ascii", "test.hugeposreturn")
|
|
||||||
|
# Valid negative position
|
||||||
|
handler.pos = -2
|
||||||
|
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
|
||||||
|
|
||||||
|
# Negative position out of bounds
|
||||||
|
handler.pos = -3
|
||||||
|
self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
|
||||||
|
|
||||||
|
# Valid positive position
|
||||||
|
handler.pos = 1
|
||||||
|
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
|
||||||
|
|
||||||
|
# Largest valid positive position (one beyond end of input)
|
||||||
|
handler.pos = 2
|
||||||
|
self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
|
||||||
|
|
||||||
|
# Invalid positive position
|
||||||
|
handler.pos = 3
|
||||||
|
self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
|
||||||
|
|
||||||
|
handler.pos = 0
|
||||||
|
|
||||||
class D(dict):
|
class D(dict):
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
raise ValueError
|
raise ValueError
|
||||||
for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.hugeposreturn"):
|
for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
|
||||||
self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
|
self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
|
||||||
self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
|
self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
|
||||||
self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
|
self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
|
||||||
|
@ -247,8 +247,13 @@ errorexit_cbpad: Py_XDECREF(retobj);
|
|||||||
Py_DECREF(retobj);
|
Py_DECREF(retobj);
|
||||||
|
|
||||||
if (newpos < 0)
|
if (newpos < 0)
|
||||||
newpos = inputlen - newpos;
|
newpos = inputlen + newpos;
|
||||||
if (newpos < 0 || newpos >= inputlen)
|
if (newpos < 0 || newpos > inputlen) {
|
||||||
|
PyErr_Format(PyExc_IndexError, "position %ld from error handler"
|
||||||
|
" out of bounds", newpos);
|
||||||
|
goto errorexit;
|
||||||
|
}
|
||||||
|
if (newpos == inputlen)
|
||||||
break;
|
break;
|
||||||
inp = inp_top + Py_UNICODE_SIZE * newpos;
|
inp = inp_top + Py_UNICODE_SIZE * newpos;
|
||||||
inplen = inplen_total - Py_UNICODE_SIZE * newpos;
|
inplen = inplen_total - Py_UNICODE_SIZE * newpos;
|
||||||
@ -471,8 +476,13 @@ errorexit_cbpad: Py_DECREF(retobj);
|
|||||||
Py_DECREF(retobj);
|
Py_DECREF(retobj);
|
||||||
|
|
||||||
if (newpos < 0)
|
if (newpos < 0)
|
||||||
newpos = inplen_total - newpos;
|
newpos = inplen_total + newpos;
|
||||||
if (newpos < 0 || newpos >= inplen_total)
|
if (newpos < 0 || newpos > inplen_total) {
|
||||||
|
PyErr_Format(PyExc_IndexError, "position %ld from error handler"
|
||||||
|
" out of bounds", newpos);
|
||||||
|
goto errorexit;
|
||||||
|
}
|
||||||
|
if (newpos == inplen_total)
|
||||||
break;
|
break;
|
||||||
inp = inp_top + newpos;
|
inp = inp_top + newpos;
|
||||||
inplen = inplen_total - newpos;
|
inplen = inplen_total - newpos;
|
||||||
|
@ -728,9 +728,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
|
|||||||
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
|
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
|
||||||
goto onError;
|
goto onError;
|
||||||
if (newpos<0)
|
if (newpos<0)
|
||||||
newpos = 0;
|
newpos = insize+newpos;
|
||||||
else if (newpos>insize)
|
if (newpos<0 || newpos>insize) {
|
||||||
newpos = insize;
|
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
|
||||||
/* need more space? (at least enough for what we
|
/* need more space? (at least enough for what we
|
||||||
have+the replacement+the rest of the string (starting
|
have+the replacement+the rest of the string (starting
|
||||||
@ -2246,9 +2248,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (*newpos<0)
|
if (*newpos<0)
|
||||||
*newpos = 0;
|
*newpos = size+*newpos;
|
||||||
else if (*newpos>size)
|
if (*newpos<0 || *newpos>size) {
|
||||||
*newpos = size;
|
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
|
||||||
|
Py_DECREF(restuple);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
Py_INCREF(resunicode);
|
Py_INCREF(resunicode);
|
||||||
Py_DECREF(restuple);
|
Py_DECREF(restuple);
|
||||||
return resunicode;
|
return resunicode;
|
||||||
@ -3084,9 +3089,12 @@ static PyObject *unicode_translate_call_errorhandler(const char *errors,
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
if (*newpos<0)
|
if (*newpos<0)
|
||||||
*newpos = 0;
|
*newpos = size+*newpos;
|
||||||
else if (*newpos>size)
|
if (*newpos<0 || *newpos>size) {
|
||||||
*newpos = size;
|
PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
|
||||||
|
Py_DECREF(restuple);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
Py_INCREF(resunicode);
|
Py_INCREF(resunicode);
|
||||||
Py_DECREF(restuple);
|
Py_DECREF(restuple);
|
||||||
return resunicode;
|
return resunicode;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user