1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
14 /* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one try is
   made. */
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject
*search_function
)
31 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
32 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
34 if (search_function
== NULL
) {
38 if (!PyCallable_Check(search_function
)) {
39 PyErr_SetString(PyExc_TypeError
, "argument must be callable");
42 return PyList_Append(interp
->codec_search_path
, search_function
);
48 /* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
52 PyObject
*normalizestring(const char *string
)
55 size_t len
= strlen(string
);
60 PyErr_SetString(PyExc_OverflowError
, "string is too large");
64 v
= PyString_FromStringAndSize(NULL
, (int)len
);
67 p
= PyString_AS_STRING(v
);
68 for (i
= 0; i
< len
; i
++) {
69 register char ch
= string
[i
];
79 /* Lookup the given encoding and return a tuple providing the codec
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
86 If no codec is found, a LookupError is set and NULL returned.
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
94 PyObject
*_PyCodec_Lookup(const char *encoding
)
96 PyInterpreterState
*interp
;
97 PyObject
*result
, *args
= NULL
, *v
;
100 if (encoding
== NULL
) {
105 interp
= PyThreadState_GET()->interp
;
106 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v
= normalizestring(encoding
);
115 PyString_InternInPlace(&v
);
117 /* First, try to lookup the name in the registry dictionary */
118 result
= PyDict_GetItem(interp
->codec_search_cache
, v
);
119 if (result
!= NULL
) {
125 /* Next, scan the search functions in order of registration */
126 args
= PyTuple_New(1);
129 PyTuple_SET_ITEM(args
,0,v
);
131 len
= PyList_Size(interp
->codec_search_path
);
135 PyErr_SetString(PyExc_LookupError
,
136 "no codec search functions registered: "
137 "can't find encoding");
141 for (i
= 0; i
< len
; i
++) {
144 func
= PyList_GetItem(interp
->codec_search_path
, i
);
147 result
= PyEval_CallObject(func
, args
);
150 if (result
== Py_None
) {
154 if (!PyTuple_Check(result
) || PyTuple_GET_SIZE(result
) != 4) {
155 PyErr_SetString(PyExc_TypeError
,
156 "codec search functions must return 4-tuples");
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError
,
165 "unknown encoding: %s", encoding
);
169 /* Cache and return the result */
170 PyDict_SetItem(interp
->codec_search_cache
, v
, result
);
180 PyObject
*args_tuple(PyObject
*object
,
185 args
= PyTuple_New(1 + (errors
!= NULL
));
189 PyTuple_SET_ITEM(args
,0,object
);
193 v
= PyString_FromString(errors
);
198 PyTuple_SET_ITEM(args
, 1, v
);
203 /* Build a codec by calling factory(stream[,errors]) or just
204 factory(errors) depending on whether the given parameters are
208 PyObject
*build_stream_codec(PyObject
*factory
,
212 PyObject
*args
, *codec
;
214 args
= args_tuple(stream
, errors
);
218 codec
= PyEval_CallObject(factory
, args
);
223 /* Convenience APIs to query the Codec registry.
225 All APIs return a codec object with incremented refcount.
229 PyObject
*PyCodec_Encoder(const char *encoding
)
234 codecs
= _PyCodec_Lookup(encoding
);
237 v
= PyTuple_GET_ITEM(codecs
,0);
246 PyObject
*PyCodec_Decoder(const char *encoding
)
251 codecs
= _PyCodec_Lookup(encoding
);
254 v
= PyTuple_GET_ITEM(codecs
,1);
263 PyObject
*PyCodec_StreamReader(const char *encoding
,
267 PyObject
*codecs
, *ret
;
269 codecs
= _PyCodec_Lookup(encoding
);
272 ret
= build_stream_codec(PyTuple_GET_ITEM(codecs
,2),stream
,errors
);
280 PyObject
*PyCodec_StreamWriter(const char *encoding
,
284 PyObject
*codecs
, *ret
;
286 codecs
= _PyCodec_Lookup(encoding
);
289 ret
= build_stream_codec(PyTuple_GET_ITEM(codecs
,3),stream
,errors
);
/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder factory as an argument if non-NULL. */
302 PyObject
*PyCodec_Encode(PyObject
*object
,
303 const char *encoding
,
306 PyObject
*encoder
= NULL
;
307 PyObject
*args
= NULL
, *result
= NULL
;
310 encoder
= PyCodec_Encoder(encoding
);
314 args
= args_tuple(object
, errors
);
318 result
= PyEval_CallObject(encoder
,args
);
322 if (!PyTuple_Check(result
) ||
323 PyTuple_GET_SIZE(result
) != 2) {
324 PyErr_SetString(PyExc_TypeError
,
325 "encoder must return a tuple (object,integer)");
328 v
= PyTuple_GET_ITEM(result
,0);
330 /* We don't check or use the second (integer) entry. */
/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder factory as an argument if non-NULL. */
349 PyObject
*PyCodec_Decode(PyObject
*object
,
350 const char *encoding
,
353 PyObject
*decoder
= NULL
;
354 PyObject
*args
= NULL
, *result
= NULL
;
357 decoder
= PyCodec_Decoder(encoding
);
361 args
= args_tuple(object
, errors
);
365 result
= PyEval_CallObject(decoder
,args
);
368 if (!PyTuple_Check(result
) ||
369 PyTuple_GET_SIZE(result
) != 2) {
370 PyErr_SetString(PyExc_TypeError
,
371 "decoder must return a tuple (object,integer)");
374 v
= PyTuple_GET_ITEM(result
,0);
376 /* We don't check or use the second (integer) entry. */
390 /* Register the error handling callback function error under the name
391 name. This function will be called by the codec when it encounters
392 an unencodable characters/undecodable bytes and doesn't know the
393 callback name, when name is specified as the error parameter
394 in the call to the encode/decode function.
395 Return 0 on success, -1 on error */
396 int PyCodec_RegisterError(const char *name
, PyObject
*error
)
398 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
399 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
401 if (!PyCallable_Check(error
)) {
402 PyErr_SetString(PyExc_TypeError
, "handler must be callable");
405 return PyDict_SetItemString(interp
->codec_error_registry
,
406 (char *)name
, error
);
/* Look up the error handling callback function registered under the
   name `name`. As a special case, NULL can be passed, in which case
   the error handling callback for strict encoding will be returned. */
412 PyObject
*PyCodec_LookupError(const char *name
)
414 PyObject
*handler
= NULL
;
416 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
417 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
422 handler
= PyDict_GetItemString(interp
->codec_error_registry
, (char *)name
);
424 PyErr_Format(PyExc_LookupError
, "unknown error handler name '%.400s'", name
);
430 static void wrong_exception_type(PyObject
*exc
)
432 PyObject
*type
= PyObject_GetAttrString(exc
, "__class__");
434 PyObject
*name
= PyObject_GetAttrString(type
, "__name__");
437 PyObject
*string
= PyObject_Str(name
);
439 if (string
!= NULL
) {
440 PyErr_Format(PyExc_TypeError
,
441 "don't know how to handle %.400s in error callback",
442 PyString_AS_STRING(string
));
449 PyObject
*PyCodec_StrictErrors(PyObject
*exc
)
451 if (PyInstance_Check(exc
))
452 PyErr_SetObject((PyObject
*)((PyInstanceObject
*)exc
)->in_class
,
455 PyErr_SetString(PyExc_TypeError
, "codec must pass exception instance");
460 #ifdef Py_USING_UNICODE
461 PyObject
*PyCodec_IgnoreErrors(PyObject
*exc
)
464 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
465 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
468 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
469 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
472 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
473 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
477 wrong_exception_type(exc
);
480 /* ouch: passing NULL, 0, pos gives None instead of u'' */
481 return Py_BuildValue("(u#i)", &end
, 0, end
);
485 PyObject
*PyCodec_ReplaceErrors(PyObject
*exc
)
492 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
495 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
497 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
499 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
502 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
505 restuple
= Py_BuildValue("(Oi)", res
, end
);
509 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
510 Py_UNICODE res
= Py_UNICODE_REPLACEMENT_CHARACTER
;
511 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
513 return Py_BuildValue("(u#i)", &res
, 1, end
);
515 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
518 if (PyUnicodeTranslateError_GetStart(exc
, &start
))
520 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
522 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
525 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
527 *p
= Py_UNICODE_REPLACEMENT_CHARACTER
;
528 restuple
= Py_BuildValue("(Oi)", res
, end
);
533 wrong_exception_type(exc
);
538 PyObject
*PyCodec_XMLCharRefReplaceErrors(PyObject
*exc
)
540 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
550 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
552 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
554 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
556 startp
= PyUnicode_AS_UNICODE(object
);
557 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
566 #ifndef Py_UNICODE_WIDE
578 /* allocate replacement */
579 res
= PyUnicode_FromUnicode(NULL
, ressize
);
584 /* generate replacement */
585 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
586 p
< startp
+end
; ++p
) {
608 #ifndef Py_UNICODE_WIDE
614 else if (*p
<100000) {
618 else if (*p
<1000000) {
628 *outp
++ = '0' + c
/base
;
634 restuple
= Py_BuildValue("(Oi)", res
, end
);
640 wrong_exception_type(exc
);
645 static Py_UNICODE hexdigits
[] = {
646 '0', '1', '2', '3', '4', '5', '6', '7',
647 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
650 PyObject
*PyCodec_BackslashReplaceErrors(PyObject
*exc
)
652 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
662 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
664 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
666 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
668 startp
= PyUnicode_AS_UNICODE(object
);
669 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
670 #ifdef Py_UNICODE_WIDE
671 if (*p
>= 0x00010000)
681 res
= PyUnicode_FromUnicode(NULL
, ressize
);
684 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
685 p
< startp
+end
; ++p
) {
688 #ifdef Py_UNICODE_WIDE
689 if (c
>= 0x00010000) {
691 *outp
++ = hexdigits
[(c
>>28)&0xf];
692 *outp
++ = hexdigits
[(c
>>24)&0xf];
693 *outp
++ = hexdigits
[(c
>>20)&0xf];
694 *outp
++ = hexdigits
[(c
>>16)&0xf];
695 *outp
++ = hexdigits
[(c
>>12)&0xf];
696 *outp
++ = hexdigits
[(c
>>8)&0xf];
702 *outp
++ = hexdigits
[(c
>>12)&0xf];
703 *outp
++ = hexdigits
[(c
>>8)&0xf];
707 *outp
++ = hexdigits
[(c
>>4)&0xf];
708 *outp
++ = hexdigits
[c
&0xf];
711 restuple
= Py_BuildValue("(Oi)", res
, end
);
717 wrong_exception_type(exc
);
723 static PyObject
*strict_errors(PyObject
*self
, PyObject
*exc
)
725 return PyCodec_StrictErrors(exc
);
729 #ifdef Py_USING_UNICODE
730 static PyObject
*ignore_errors(PyObject
*self
, PyObject
*exc
)
732 return PyCodec_IgnoreErrors(exc
);
736 static PyObject
*replace_errors(PyObject
*self
, PyObject
*exc
)
738 return PyCodec_ReplaceErrors(exc
);
742 static PyObject
*xmlcharrefreplace_errors(PyObject
*self
, PyObject
*exc
)
744 return PyCodec_XMLCharRefReplaceErrors(exc
);
748 static PyObject
*backslashreplace_errors(PyObject
*self
, PyObject
*exc
)
750 return PyCodec_BackslashReplaceErrors(exc
);
754 static int _PyCodecRegistry_Init(void)
769 #ifdef Py_USING_UNICODE
789 "xmlcharrefreplace_errors",
790 xmlcharrefreplace_errors
,
797 "backslashreplace_errors",
798 backslashreplace_errors
,
805 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
809 if (interp
->codec_search_path
!= NULL
)
812 interp
->codec_search_path
= PyList_New(0);
813 interp
->codec_search_cache
= PyDict_New();
814 interp
->codec_error_registry
= PyDict_New();
816 if (interp
->codec_error_registry
) {
817 for (i
= 0; i
< sizeof(methods
)/sizeof(methods
[0]); ++i
) {
818 PyObject
*func
= PyCFunction_New(&methods
[i
].def
, NULL
);
821 Py_FatalError("can't initialize codec error registry");
822 res
= PyCodec_RegisterError(methods
[i
].name
, func
);
825 Py_FatalError("can't initialize codec error registry");
829 if (interp
->codec_search_path
== NULL
||
830 interp
->codec_search_cache
== NULL
||
831 interp
->codec_error_registry
== NULL
)
832 Py_FatalError("can't initialize codec registry");
834 mod
= PyImport_ImportModuleEx("encodings", NULL
, NULL
, NULL
);
836 if (PyErr_ExceptionMatches(PyExc_ImportError
)) {
837 /* Ignore ImportErrors... this is done so that
838 distributions can disable the encodings package. Note
839 that other errors are not masked, e.g. SystemErrors
840 raised to inform the user of an error in the Python
841 configuration are still reported back to the user. */