1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
14 /* --- Codec Registry ----------------------------------------------------- */
16 /* Import the standard encodings package which will register the first
17 codec search function.
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
22 ImportErrors are silently ignored by this function. Only one try is
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject
*search_function
)
31 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
32 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
34 if (search_function
== NULL
) {
38 if (!PyCallable_Check(search_function
)) {
39 PyErr_SetString(PyExc_TypeError
, "argument must be callable");
42 return PyList_Append(interp
->codec_search_path
, search_function
);
48 /* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
52 PyObject
*normalizestring(const char *string
)
55 size_t len
= strlen(string
);
60 PyErr_SetString(PyExc_OverflowError
, "string is too large");
64 v
= PyString_FromStringAndSize(NULL
, (int)len
);
67 p
= PyString_AS_STRING(v
);
68 for (i
= 0; i
< len
; i
++) {
69 register char ch
= string
[i
];
79 /* Lookup the given encoding and return a tuple providing the codec
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
86 If no codec is found, a LookupError is set and NULL returned.
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
94 PyObject
*_PyCodec_Lookup(const char *encoding
)
96 PyInterpreterState
*interp
;
97 PyObject
*result
, *args
= NULL
, *v
;
100 if (encoding
== NULL
) {
105 interp
= PyThreadState_GET()->interp
;
106 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v
= normalizestring(encoding
);
115 PyString_InternInPlace(&v
);
117 /* First, try to lookup the name in the registry dictionary */
118 result
= PyDict_GetItem(interp
->codec_search_cache
, v
);
119 if (result
!= NULL
) {
125 /* Next, scan the search functions in order of registration */
126 args
= PyTuple_New(1);
129 PyTuple_SET_ITEM(args
,0,v
);
131 len
= PyList_Size(interp
->codec_search_path
);
135 PyErr_SetString(PyExc_LookupError
,
136 "no codec search functions registered: "
137 "can't find encoding");
141 for (i
= 0; i
< len
; i
++) {
144 func
= PyList_GetItem(interp
->codec_search_path
, i
);
147 result
= PyEval_CallObject(func
, args
);
150 if (result
== Py_None
) {
154 if (!PyTuple_Check(result
) || PyTuple_GET_SIZE(result
) != 4) {
155 PyErr_SetString(PyExc_TypeError
,
156 "codec search functions must return 4-tuples");
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError
,
165 "unknown encoding: %s", encoding
);
169 /* Cache and return the result */
170 PyDict_SetItem(interp
->codec_search_cache
, v
, result
);
180 PyObject
*args_tuple(PyObject
*object
,
185 args
= PyTuple_New(1 + (errors
!= NULL
));
189 PyTuple_SET_ITEM(args
,0,object
);
193 v
= PyString_FromString(errors
);
198 PyTuple_SET_ITEM(args
, 1, v
);
203 /* Build a codec by calling factory(stream[,errors]) or just
204 factory(errors) depending on whether the given parameters are
208 PyObject
*build_stream_codec(PyObject
*factory
,
212 PyObject
*args
, *codec
;
214 args
= args_tuple(stream
, errors
);
218 codec
= PyEval_CallObject(factory
, args
);
223 /* Convenience APIs to query the Codec registry.
225 All APIs return a codec object with incremented refcount.
229 PyObject
*PyCodec_Encoder(const char *encoding
)
234 codecs
= _PyCodec_Lookup(encoding
);
237 v
= PyTuple_GET_ITEM(codecs
,0);
246 PyObject
*PyCodec_Decoder(const char *encoding
)
251 codecs
= _PyCodec_Lookup(encoding
);
254 v
= PyTuple_GET_ITEM(codecs
,1);
263 PyObject
*PyCodec_IncrementalEncoder(const char *encoding
,
266 PyObject
*codecs
, *ret
, *encoder
;
268 codecs
= _PyCodec_Lookup(encoding
);
271 encoder
= PyObject_GetAttrString(codecs
, "incrementalencoder");
272 if (encoder
== NULL
) {
277 ret
= PyObject_CallFunction(encoder
, "O", errors
);
279 ret
= PyObject_CallFunction(encoder
, NULL
);
288 PyObject
*PyCodec_IncrementalDecoder(const char *encoding
,
291 PyObject
*codecs
, *ret
, *decoder
;
293 codecs
= _PyCodec_Lookup(encoding
);
296 decoder
= PyObject_GetAttrString(codecs
, "incrementaldecoder");
297 if (decoder
== NULL
) {
302 ret
= PyObject_CallFunction(decoder
, "O", errors
);
304 ret
= PyObject_CallFunction(decoder
, NULL
);
313 PyObject
*PyCodec_StreamReader(const char *encoding
,
317 PyObject
*codecs
, *ret
;
319 codecs
= _PyCodec_Lookup(encoding
);
322 ret
= build_stream_codec(PyTuple_GET_ITEM(codecs
,2),stream
,errors
);
330 PyObject
*PyCodec_StreamWriter(const char *encoding
,
334 PyObject
*codecs
, *ret
;
336 codecs
= _PyCodec_Lookup(encoding
);
339 ret
= build_stream_codec(PyTuple_GET_ITEM(codecs
,3),stream
,errors
);
/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder factory as an argument if non-NULL. */
352 PyObject
*PyCodec_Encode(PyObject
*object
,
353 const char *encoding
,
356 PyObject
*encoder
= NULL
;
357 PyObject
*args
= NULL
, *result
= NULL
;
360 encoder
= PyCodec_Encoder(encoding
);
364 args
= args_tuple(object
, errors
);
368 result
= PyEval_CallObject(encoder
,args
);
372 if (!PyTuple_Check(result
) ||
373 PyTuple_GET_SIZE(result
) != 2) {
374 PyErr_SetString(PyExc_TypeError
,
375 "encoder must return a tuple (object,integer)");
378 v
= PyTuple_GET_ITEM(result
,0);
380 /* We don't check or use the second (integer) entry. */
/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder factory as an argument if non-NULL. */
399 PyObject
*PyCodec_Decode(PyObject
*object
,
400 const char *encoding
,
403 PyObject
*decoder
= NULL
;
404 PyObject
*args
= NULL
, *result
= NULL
;
407 decoder
= PyCodec_Decoder(encoding
);
411 args
= args_tuple(object
, errors
);
415 result
= PyEval_CallObject(decoder
,args
);
418 if (!PyTuple_Check(result
) ||
419 PyTuple_GET_SIZE(result
) != 2) {
420 PyErr_SetString(PyExc_TypeError
,
421 "decoder must return a tuple (object,integer)");
424 v
= PyTuple_GET_ITEM(result
,0);
426 /* We don't check or use the second (integer) entry. */
440 /* Register the error handling callback function error under the name
441 name. This function will be called by the codec when it encounters
442 an unencodable characters/undecodable bytes and doesn't know the
443 callback name, when name is specified as the error parameter
444 in the call to the encode/decode function.
445 Return 0 on success, -1 on error */
446 int PyCodec_RegisterError(const char *name
, PyObject
*error
)
448 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
449 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
451 if (!PyCallable_Check(error
)) {
452 PyErr_SetString(PyExc_TypeError
, "handler must be callable");
455 return PyDict_SetItemString(interp
->codec_error_registry
,
456 (char *)name
, error
);
459 /* Lookup the error handling callback function registered under the
460 name error. As a special case NULL can be passed, in which case
461 the error handling callback for strict encoding will be returned. */
462 PyObject
*PyCodec_LookupError(const char *name
)
464 PyObject
*handler
= NULL
;
466 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
467 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
472 handler
= PyDict_GetItemString(interp
->codec_error_registry
, (char *)name
);
474 PyErr_Format(PyExc_LookupError
, "unknown error handler name '%.400s'", name
);
480 static void wrong_exception_type(PyObject
*exc
)
482 PyObject
*type
= PyObject_GetAttrString(exc
, "__class__");
484 PyObject
*name
= PyObject_GetAttrString(type
, "__name__");
487 PyObject
*string
= PyObject_Str(name
);
489 if (string
!= NULL
) {
490 PyErr_Format(PyExc_TypeError
,
491 "don't know how to handle %.400s in error callback",
492 PyString_AS_STRING(string
));
499 PyObject
*PyCodec_StrictErrors(PyObject
*exc
)
501 if (PyExceptionInstance_Check(exc
))
502 PyErr_SetObject(PyExceptionInstance_Class(exc
), exc
);
504 PyErr_SetString(PyExc_TypeError
, "codec must pass exception instance");
509 #ifdef Py_USING_UNICODE
510 PyObject
*PyCodec_IgnoreErrors(PyObject
*exc
)
513 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
514 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
517 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
518 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
521 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
522 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
526 wrong_exception_type(exc
);
529 /* ouch: passing NULL, 0, pos gives None instead of u'' */
530 return Py_BuildValue("(u#n)", &end
, 0, end
);
534 PyObject
*PyCodec_ReplaceErrors(PyObject
*exc
)
541 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
544 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
546 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
548 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
551 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
554 restuple
= Py_BuildValue("(On)", res
, end
);
558 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
559 Py_UNICODE res
= Py_UNICODE_REPLACEMENT_CHARACTER
;
560 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
562 return Py_BuildValue("(u#n)", &res
, 1, end
);
564 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
567 if (PyUnicodeTranslateError_GetStart(exc
, &start
))
569 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
571 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
574 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
576 *p
= Py_UNICODE_REPLACEMENT_CHARACTER
;
577 restuple
= Py_BuildValue("(On)", res
, end
);
582 wrong_exception_type(exc
);
587 PyObject
*PyCodec_XMLCharRefReplaceErrors(PyObject
*exc
)
589 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
599 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
601 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
603 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
605 startp
= PyUnicode_AS_UNICODE(object
);
606 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
615 #ifndef Py_UNICODE_WIDE
627 /* allocate replacement */
628 res
= PyUnicode_FromUnicode(NULL
, ressize
);
633 /* generate replacement */
634 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
635 p
< startp
+end
; ++p
) {
657 #ifndef Py_UNICODE_WIDE
663 else if (*p
<100000) {
667 else if (*p
<1000000) {
677 *outp
++ = '0' + c
/base
;
683 restuple
= Py_BuildValue("(On)", res
, end
);
689 wrong_exception_type(exc
);
694 static Py_UNICODE hexdigits
[] = {
695 '0', '1', '2', '3', '4', '5', '6', '7',
696 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
699 PyObject
*PyCodec_BackslashReplaceErrors(PyObject
*exc
)
701 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
711 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
713 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
715 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
717 startp
= PyUnicode_AS_UNICODE(object
);
718 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
719 #ifdef Py_UNICODE_WIDE
720 if (*p
>= 0x00010000)
730 res
= PyUnicode_FromUnicode(NULL
, ressize
);
733 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
734 p
< startp
+end
; ++p
) {
737 #ifdef Py_UNICODE_WIDE
738 if (c
>= 0x00010000) {
740 *outp
++ = hexdigits
[(c
>>28)&0xf];
741 *outp
++ = hexdigits
[(c
>>24)&0xf];
742 *outp
++ = hexdigits
[(c
>>20)&0xf];
743 *outp
++ = hexdigits
[(c
>>16)&0xf];
744 *outp
++ = hexdigits
[(c
>>12)&0xf];
745 *outp
++ = hexdigits
[(c
>>8)&0xf];
751 *outp
++ = hexdigits
[(c
>>12)&0xf];
752 *outp
++ = hexdigits
[(c
>>8)&0xf];
756 *outp
++ = hexdigits
[(c
>>4)&0xf];
757 *outp
++ = hexdigits
[c
&0xf];
760 restuple
= Py_BuildValue("(On)", res
, end
);
766 wrong_exception_type(exc
);
772 static PyObject
*strict_errors(PyObject
*self
, PyObject
*exc
)
774 return PyCodec_StrictErrors(exc
);
778 #ifdef Py_USING_UNICODE
779 static PyObject
*ignore_errors(PyObject
*self
, PyObject
*exc
)
781 return PyCodec_IgnoreErrors(exc
);
785 static PyObject
*replace_errors(PyObject
*self
, PyObject
*exc
)
787 return PyCodec_ReplaceErrors(exc
);
791 static PyObject
*xmlcharrefreplace_errors(PyObject
*self
, PyObject
*exc
)
793 return PyCodec_XMLCharRefReplaceErrors(exc
);
797 static PyObject
*backslashreplace_errors(PyObject
*self
, PyObject
*exc
)
799 return PyCodec_BackslashReplaceErrors(exc
);
803 static int _PyCodecRegistry_Init(void)
818 #ifdef Py_USING_UNICODE
838 "xmlcharrefreplace_errors",
839 xmlcharrefreplace_errors
,
846 "backslashreplace_errors",
847 backslashreplace_errors
,
854 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
858 if (interp
->codec_search_path
!= NULL
)
861 interp
->codec_search_path
= PyList_New(0);
862 interp
->codec_search_cache
= PyDict_New();
863 interp
->codec_error_registry
= PyDict_New();
865 if (interp
->codec_error_registry
) {
866 for (i
= 0; i
< sizeof(methods
)/sizeof(methods
[0]); ++i
) {
867 PyObject
*func
= PyCFunction_New(&methods
[i
].def
, NULL
);
870 Py_FatalError("can't initialize codec error registry");
871 res
= PyCodec_RegisterError(methods
[i
].name
, func
);
874 Py_FatalError("can't initialize codec error registry");
878 if (interp
->codec_search_path
== NULL
||
879 interp
->codec_search_cache
== NULL
||
880 interp
->codec_error_registry
== NULL
)
881 Py_FatalError("can't initialize codec registry");
883 mod
= PyImport_ImportModuleLevel("encodings", NULL
, NULL
, NULL
, 0);
885 if (PyErr_ExceptionMatches(PyExc_ImportError
)) {
886 /* Ignore ImportErrors... this is done so that
887 distributions can disable the encodings package. Note
888 that other errors are not masked, e.g. SystemErrors
889 raised to inform the user of an error in the Python
890 configuration are still reported back to the user. */