1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
14 /* --- Codec Registry ----------------------------------------------------- */
16 /* Import the standard encodings package which will register the first
17 codec search function.
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
22 ImportErrors are silently ignored by this function. Only one try is made. */
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject
*search_function
)
31 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
32 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
34 if (search_function
== NULL
) {
38 if (!PyCallable_Check(search_function
)) {
39 PyErr_SetString(PyExc_TypeError
, "argument must be callable");
42 return PyList_Append(interp
->codec_search_path
, search_function
);
48 /* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
52 PyObject
*normalizestring(const char *string
)
55 size_t len
= strlen(string
);
59 if (len
> PY_SSIZE_T_MAX
) {
60 PyErr_SetString(PyExc_OverflowError
, "string is too large");
64 v
= PyString_FromStringAndSize(NULL
, len
);
67 p
= PyString_AS_STRING(v
);
68 for (i
= 0; i
< len
; i
++) {
69 register char ch
= string
[i
];
73 ch
= tolower(Py_CHARMASK(ch
));
79 /* Look up the given encoding and return a tuple providing the codec facilities.
82 The encoding string is converted to all lower-case characters before
83 it is looked up. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
86 If no codec is found, a LookupError is set and NULL is returned.
88 As a side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings package. */
94 PyObject
*_PyCodec_Lookup(const char *encoding
)
96 PyInterpreterState
*interp
;
97 PyObject
*result
, *args
= NULL
, *v
;
100 if (encoding
== NULL
) {
105 interp
= PyThreadState_GET()->interp
;
106 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v
= normalizestring(encoding
);
115 PyString_InternInPlace(&v
);
117 /* First, try to lookup the name in the registry dictionary */
118 result
= PyDict_GetItem(interp
->codec_search_cache
, v
);
119 if (result
!= NULL
) {
125 /* Next, scan the search functions in order of registration */
126 args
= PyTuple_New(1);
129 PyTuple_SET_ITEM(args
,0,v
);
131 len
= PyList_Size(interp
->codec_search_path
);
135 PyErr_SetString(PyExc_LookupError
,
136 "no codec search functions registered: "
137 "can't find encoding");
141 for (i
= 0; i
< len
; i
++) {
144 func
= PyList_GetItem(interp
->codec_search_path
, i
);
147 result
= PyEval_CallObject(func
, args
);
150 if (result
== Py_None
) {
154 if (!PyTuple_Check(result
) || PyTuple_GET_SIZE(result
) != 4) {
155 PyErr_SetString(PyExc_TypeError
,
156 "codec search functions must return 4-tuples");
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError
,
165 "unknown encoding: %s", encoding
);
169 /* Cache and return the result */
170 PyDict_SetItem(interp
->codec_search_cache
, v
, result
);
180 PyObject
*args_tuple(PyObject
*object
,
185 args
= PyTuple_New(1 + (errors
!= NULL
));
189 PyTuple_SET_ITEM(args
,0,object
);
193 v
= PyString_FromString(errors
);
198 PyTuple_SET_ITEM(args
, 1, v
);
203 /* Helper function to get a codec item */
206 PyObject
*codec_getitem(const char *encoding
, int index
)
211 codecs
= _PyCodec_Lookup(encoding
);
214 v
= PyTuple_GET_ITEM(codecs
, index
);
220 /* Helper function to create an incremental codec. */
223 PyObject
*codec_getincrementalcodec(const char *encoding
,
225 const char *attrname
)
227 PyObject
*codecs
, *ret
, *inccodec
;
229 codecs
= _PyCodec_Lookup(encoding
);
232 inccodec
= PyObject_GetAttrString(codecs
, attrname
);
234 if (inccodec
== NULL
)
237 ret
= PyObject_CallFunction(inccodec
, "s", errors
);
239 ret
= PyObject_CallFunction(inccodec
, NULL
);
244 /* Helper function to create a stream codec. */
247 PyObject
*codec_getstreamcodec(const char *encoding
,
252 PyObject
*codecs
, *streamcodec
, *codeccls
;
254 codecs
= _PyCodec_Lookup(encoding
);
258 codeccls
= PyTuple_GET_ITEM(codecs
, index
);
260 streamcodec
= PyObject_CallFunction(codeccls
, "Os", stream
, errors
);
262 streamcodec
= PyObject_CallFunction(codeccls
, "O", stream
);
267 /* Convenience APIs to query the Codec registry.
269 All APIs return a codec object with incremented refcount. */
273 PyObject
*PyCodec_Encoder(const char *encoding
)
275 return codec_getitem(encoding
, 0);
278 PyObject
*PyCodec_Decoder(const char *encoding
)
280 return codec_getitem(encoding
, 1);
283 PyObject
*PyCodec_IncrementalEncoder(const char *encoding
,
286 return codec_getincrementalcodec(encoding
, errors
, "incrementalencoder");
289 PyObject
*PyCodec_IncrementalDecoder(const char *encoding
,
292 return codec_getincrementalcodec(encoding
, errors
, "incrementaldecoder");
295 PyObject
*PyCodec_StreamReader(const char *encoding
,
299 return codec_getstreamcodec(encoding
, stream
, errors
, 2);
302 PyObject
*PyCodec_StreamWriter(const char *encoding
,
306 return codec_getstreamcodec(encoding
, stream
, errors
, 3);
309 /* Encode an object (e.g. a Unicode object) using the given encoding
310 and return the resulting encoded object (usually a Python string).
312 errors is passed to the encoder factory as argument if non-NULL. */
314 PyObject
*PyCodec_Encode(PyObject
*object
,
315 const char *encoding
,
318 PyObject
*encoder
= NULL
;
319 PyObject
*args
= NULL
, *result
= NULL
;
322 encoder
= PyCodec_Encoder(encoding
);
326 args
= args_tuple(object
, errors
);
330 result
= PyEval_CallObject(encoder
,args
);
334 if (!PyTuple_Check(result
) ||
335 PyTuple_GET_SIZE(result
) != 2) {
336 PyErr_SetString(PyExc_TypeError
,
337 "encoder must return a tuple (object,integer)");
340 v
= PyTuple_GET_ITEM(result
,0);
342 /* We don't check or use the second (integer) entry. */
356 /* Decode an object (usually a Python string) using the given encoding
357 and return an equivalent object (e.g. a Unicode object).
359 errors is passed to the decoder factory as argument if non-NULL. */
361 PyObject
*PyCodec_Decode(PyObject
*object
,
362 const char *encoding
,
365 PyObject
*decoder
= NULL
;
366 PyObject
*args
= NULL
, *result
= NULL
;
369 decoder
= PyCodec_Decoder(encoding
);
373 args
= args_tuple(object
, errors
);
377 result
= PyEval_CallObject(decoder
,args
);
380 if (!PyTuple_Check(result
) ||
381 PyTuple_GET_SIZE(result
) != 2) {
382 PyErr_SetString(PyExc_TypeError
,
383 "decoder must return a tuple (object,integer)");
386 v
= PyTuple_GET_ITEM(result
,0);
388 /* We don't check or use the second (integer) entry. */
402 /* Register the error handling callback function error under the name
403 name. This function will be called by the codec when it encounters
404 unencodable characters/undecodable bytes and doesn't know the
405 callback name, when name is specified as the error parameter
406 in the call to the encode/decode function.
407 Return 0 on success, -1 on error */
408 int PyCodec_RegisterError(const char *name
, PyObject
*error
)
410 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
411 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
413 if (!PyCallable_Check(error
)) {
414 PyErr_SetString(PyExc_TypeError
, "handler must be callable");
417 return PyDict_SetItemString(interp
->codec_error_registry
,
418 (char *)name
, error
);
421 /* Lookup the error handling callback function registered under the
422 name error. As a special case NULL can be passed, in which case
423 the error handling callback for strict encoding will be returned. */
424 PyObject
*PyCodec_LookupError(const char *name
)
426 PyObject
*handler
= NULL
;
428 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
429 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
434 handler
= PyDict_GetItemString(interp
->codec_error_registry
, (char *)name
);
436 PyErr_Format(PyExc_LookupError
, "unknown error handler name '%.400s'", name
);
442 static void wrong_exception_type(PyObject
*exc
)
444 PyObject
*type
= PyObject_GetAttrString(exc
, "__class__");
446 PyObject
*name
= PyObject_GetAttrString(type
, "__name__");
449 PyObject
*string
= PyObject_Str(name
);
451 if (string
!= NULL
) {
452 PyErr_Format(PyExc_TypeError
,
453 "don't know how to handle %.400s in error callback",
454 PyString_AS_STRING(string
));
461 PyObject
*PyCodec_StrictErrors(PyObject
*exc
)
463 if (PyExceptionInstance_Check(exc
))
464 PyErr_SetObject(PyExceptionInstance_Class(exc
), exc
);
466 PyErr_SetString(PyExc_TypeError
, "codec must pass exception instance");
471 #ifdef Py_USING_UNICODE
472 PyObject
*PyCodec_IgnoreErrors(PyObject
*exc
)
475 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
476 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
479 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
480 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
483 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
484 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
488 wrong_exception_type(exc
);
491 /* ouch: passing NULL, 0, pos gives None instead of u'' */
492 return Py_BuildValue("(u#n)", &end
, 0, end
);
496 PyObject
*PyCodec_ReplaceErrors(PyObject
*exc
)
503 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
506 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
508 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
510 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
513 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
516 restuple
= Py_BuildValue("(On)", res
, end
);
520 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
521 Py_UNICODE res
= Py_UNICODE_REPLACEMENT_CHARACTER
;
522 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
524 return Py_BuildValue("(u#n)", &res
, 1, end
);
526 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
529 if (PyUnicodeTranslateError_GetStart(exc
, &start
))
531 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
533 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
536 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
538 *p
= Py_UNICODE_REPLACEMENT_CHARACTER
;
539 restuple
= Py_BuildValue("(On)", res
, end
);
544 wrong_exception_type(exc
);
549 PyObject
*PyCodec_XMLCharRefReplaceErrors(PyObject
*exc
)
551 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
561 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
563 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
565 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
567 startp
= PyUnicode_AS_UNICODE(object
);
568 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
577 #ifndef Py_UNICODE_WIDE
589 /* allocate replacement */
590 res
= PyUnicode_FromUnicode(NULL
, ressize
);
595 /* generate replacement */
596 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
597 p
< startp
+end
; ++p
) {
619 #ifndef Py_UNICODE_WIDE
625 else if (*p
<100000) {
629 else if (*p
<1000000) {
639 *outp
++ = '0' + c
/base
;
645 restuple
= Py_BuildValue("(On)", res
, end
);
651 wrong_exception_type(exc
);
656 static Py_UNICODE hexdigits
[] = {
657 '0', '1', '2', '3', '4', '5', '6', '7',
658 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
661 PyObject
*PyCodec_BackslashReplaceErrors(PyObject
*exc
)
663 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
673 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
675 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
677 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
679 startp
= PyUnicode_AS_UNICODE(object
);
680 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
681 #ifdef Py_UNICODE_WIDE
682 if (*p
>= 0x00010000)
692 res
= PyUnicode_FromUnicode(NULL
, ressize
);
695 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
696 p
< startp
+end
; ++p
) {
699 #ifdef Py_UNICODE_WIDE
700 if (c
>= 0x00010000) {
702 *outp
++ = hexdigits
[(c
>>28)&0xf];
703 *outp
++ = hexdigits
[(c
>>24)&0xf];
704 *outp
++ = hexdigits
[(c
>>20)&0xf];
705 *outp
++ = hexdigits
[(c
>>16)&0xf];
706 *outp
++ = hexdigits
[(c
>>12)&0xf];
707 *outp
++ = hexdigits
[(c
>>8)&0xf];
713 *outp
++ = hexdigits
[(c
>>12)&0xf];
714 *outp
++ = hexdigits
[(c
>>8)&0xf];
718 *outp
++ = hexdigits
[(c
>>4)&0xf];
719 *outp
++ = hexdigits
[c
&0xf];
722 restuple
= Py_BuildValue("(On)", res
, end
);
728 wrong_exception_type(exc
);
734 static PyObject
*strict_errors(PyObject
*self
, PyObject
*exc
)
736 return PyCodec_StrictErrors(exc
);
740 #ifdef Py_USING_UNICODE
741 static PyObject
*ignore_errors(PyObject
*self
, PyObject
*exc
)
743 return PyCodec_IgnoreErrors(exc
);
747 static PyObject
*replace_errors(PyObject
*self
, PyObject
*exc
)
749 return PyCodec_ReplaceErrors(exc
);
753 static PyObject
*xmlcharrefreplace_errors(PyObject
*self
, PyObject
*exc
)
755 return PyCodec_XMLCharRefReplaceErrors(exc
);
759 static PyObject
*backslashreplace_errors(PyObject
*self
, PyObject
*exc
)
761 return PyCodec_BackslashReplaceErrors(exc
);
765 static int _PyCodecRegistry_Init(void)
778 PyDoc_STR("Implements the 'strict' error handling, which "
779 "raises a UnicodeError on coding errors.")
782 #ifdef Py_USING_UNICODE
789 PyDoc_STR("Implements the 'ignore' error handling, which "
790 "ignores malformed data and continues.")
799 PyDoc_STR("Implements the 'replace' error handling, which "
800 "replaces malformed data with a replacement marker.")
806 "xmlcharrefreplace_errors",
807 xmlcharrefreplace_errors
,
809 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
810 "which replaces an unencodable character with the "
811 "appropriate XML character reference.")
817 "backslashreplace_errors",
818 backslashreplace_errors
,
820 PyDoc_STR("Implements the 'backslashreplace' error handling, "
821 "which replaces an unencodable character with a "
822 "backslashed escape sequence.")
828 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
832 if (interp
->codec_search_path
!= NULL
)
835 interp
->codec_search_path
= PyList_New(0);
836 interp
->codec_search_cache
= PyDict_New();
837 interp
->codec_error_registry
= PyDict_New();
839 if (interp
->codec_error_registry
) {
840 for (i
= 0; i
< sizeof(methods
)/sizeof(methods
[0]); ++i
) {
841 PyObject
*func
= PyCFunction_New(&methods
[i
].def
, NULL
);
844 Py_FatalError("can't initialize codec error registry");
845 res
= PyCodec_RegisterError(methods
[i
].name
, func
);
848 Py_FatalError("can't initialize codec error registry");
852 if (interp
->codec_search_path
== NULL
||
853 interp
->codec_search_cache
== NULL
||
854 interp
->codec_error_registry
== NULL
)
855 Py_FatalError("can't initialize codec registry");
857 mod
= PyImport_ImportModuleLevel("encodings", NULL
, NULL
, NULL
, 0);
859 if (PyErr_ExceptionMatches(PyExc_ImportError
)) {
860 /* Ignore ImportErrors... this is done so that
861 distributions can disable the encodings package. Note
862 that other errors are not masked, e.g. SystemErrors
863 raised to inform the user of an error in the Python
864 configuration are still reported back to the user. */