1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
14 /* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made. */
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject
*search_function
)
31 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
32 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
34 if (search_function
== NULL
) {
38 if (!PyCallable_Check(search_function
)) {
39 PyErr_SetString(PyExc_TypeError
, "argument must be callable");
42 return PyList_Append(interp
->codec_search_path
, search_function
);
48 /* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
52 PyObject
*normalizestring(const char *string
)
55 size_t len
= strlen(string
);
59 if (len
> PY_SSIZE_T_MAX
) {
60 PyErr_SetString(PyExc_OverflowError
, "string is too large");
64 v
= PyString_FromStringAndSize(NULL
, len
);
67 p
= PyString_AS_STRING(v
);
68 for (i
= 0; i
< len
; i
++) {
69 register char ch
= string
[i
];
73 ch
= tolower(Py_CHARMASK(ch
));
/* Lookup the given encoding and return a tuple providing the codec
   facilities.

   The encoding string is looked up converted to all lower-case
   characters. This makes encodings looked up through this mechanism
   effectively case-insensitive.

   If no codec is found, a LookupError is set and NULL returned.

   As side effect, this tries to load the encodings package, if not
   yet done. This is part of the lazy load strategy for the encodings
   package. */
94 PyObject
*_PyCodec_Lookup(const char *encoding
)
96 PyInterpreterState
*interp
;
97 PyObject
*result
, *args
= NULL
, *v
;
100 if (encoding
== NULL
) {
105 interp
= PyThreadState_GET()->interp
;
106 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v
= normalizestring(encoding
);
115 PyString_InternInPlace(&v
);
117 /* First, try to lookup the name in the registry dictionary */
118 result
= PyDict_GetItem(interp
->codec_search_cache
, v
);
119 if (result
!= NULL
) {
125 /* Next, scan the search functions in order of registration */
126 args
= PyTuple_New(1);
129 PyTuple_SET_ITEM(args
,0,v
);
131 len
= PyList_Size(interp
->codec_search_path
);
135 PyErr_SetString(PyExc_LookupError
,
136 "no codec search functions registered: "
137 "can't find encoding");
141 for (i
= 0; i
< len
; i
++) {
144 func
= PyList_GetItem(interp
->codec_search_path
, i
);
147 result
= PyEval_CallObject(func
, args
);
150 if (result
== Py_None
) {
154 if (!PyTuple_Check(result
) || PyTuple_GET_SIZE(result
) != 4) {
155 PyErr_SetString(PyExc_TypeError
,
156 "codec search functions must return 4-tuples");
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError
,
165 "unknown encoding: %s", encoding
);
169 /* Cache and return the result */
170 PyDict_SetItem(interp
->codec_search_cache
, v
, result
);
180 PyObject
*args_tuple(PyObject
*object
,
185 args
= PyTuple_New(1 + (errors
!= NULL
));
189 PyTuple_SET_ITEM(args
,0,object
);
193 v
= PyString_FromString(errors
);
198 PyTuple_SET_ITEM(args
, 1, v
);
203 /* Helper function to get a codec item */
206 PyObject
*codec_getitem(const char *encoding
, int index
)
211 codecs
= _PyCodec_Lookup(encoding
);
214 v
= PyTuple_GET_ITEM(codecs
, index
);
220 /* Helper function to create an incremental codec. */
223 PyObject
*codec_getincrementalcodec(const char *encoding
,
225 const char *attrname
)
227 PyObject
*codecs
, *ret
, *inccodec
;
229 codecs
= _PyCodec_Lookup(encoding
);
232 inccodec
= PyObject_GetAttrString(codecs
, attrname
);
234 if (inccodec
== NULL
)
237 ret
= PyObject_CallFunction(inccodec
, "s", errors
);
239 ret
= PyObject_CallFunction(inccodec
, NULL
);
244 /* Helper function to create a stream codec. */
247 PyObject
*codec_getstreamcodec(const char *encoding
,
252 PyObject
*codecs
, *streamcodec
;
254 codecs
= _PyCodec_Lookup(encoding
);
258 streamcodec
= PyEval_CallFunction(
259 PyTuple_GET_ITEM(codecs
, index
), "Os", stream
, errors
);
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount. */
270 PyObject
*PyCodec_Encoder(const char *encoding
)
272 return codec_getitem(encoding
, 0);
275 PyObject
*PyCodec_Decoder(const char *encoding
)
277 return codec_getitem(encoding
, 1);
280 PyObject
*PyCodec_IncrementalEncoder(const char *encoding
,
283 return codec_getincrementalcodec(encoding
, errors
, "incrementalencoder");
286 PyObject
*PyCodec_IncrementalDecoder(const char *encoding
,
289 return codec_getincrementalcodec(encoding
, errors
, "incrementaldecoder");
292 PyObject
*PyCodec_StreamReader(const char *encoding
,
296 return codec_getstreamcodec(encoding
, stream
, errors
, 2);
299 PyObject
*PyCodec_StreamWriter(const char *encoding
,
303 return codec_getstreamcodec(encoding
, stream
, errors
, 3);
306 /* Encode an object (e.g. an Unicode object) using the given encoding
307 and return the resulting encoded object (usually a Python string).
309 errors is passed to the encoder factory as argument if non-NULL. */
311 PyObject
*PyCodec_Encode(PyObject
*object
,
312 const char *encoding
,
315 PyObject
*encoder
= NULL
;
316 PyObject
*args
= NULL
, *result
= NULL
;
319 encoder
= PyCodec_Encoder(encoding
);
323 args
= args_tuple(object
, errors
);
327 result
= PyEval_CallObject(encoder
,args
);
331 if (!PyTuple_Check(result
) ||
332 PyTuple_GET_SIZE(result
) != 2) {
333 PyErr_SetString(PyExc_TypeError
,
334 "encoder must return a tuple (object,integer)");
337 v
= PyTuple_GET_ITEM(result
,0);
339 /* We don't check or use the second (integer) entry. */
353 /* Decode an object (usually a Python string) using the given encoding
354 and return an equivalent object (e.g. an Unicode object).
356 errors is passed to the decoder factory as argument if non-NULL. */
358 PyObject
*PyCodec_Decode(PyObject
*object
,
359 const char *encoding
,
362 PyObject
*decoder
= NULL
;
363 PyObject
*args
= NULL
, *result
= NULL
;
366 decoder
= PyCodec_Decoder(encoding
);
370 args
= args_tuple(object
, errors
);
374 result
= PyEval_CallObject(decoder
,args
);
377 if (!PyTuple_Check(result
) ||
378 PyTuple_GET_SIZE(result
) != 2) {
379 PyErr_SetString(PyExc_TypeError
,
380 "decoder must return a tuple (object,integer)");
383 v
= PyTuple_GET_ITEM(result
,0);
385 /* We don't check or use the second (integer) entry. */
399 /* Register the error handling callback function error under the name
400 name. This function will be called by the codec when it encounters
401 an unencodable characters/undecodable bytes and doesn't know the
402 callback name, when name is specified as the error parameter
403 in the call to the encode/decode function.
404 Return 0 on success, -1 on error */
405 int PyCodec_RegisterError(const char *name
, PyObject
*error
)
407 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
408 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
410 if (!PyCallable_Check(error
)) {
411 PyErr_SetString(PyExc_TypeError
, "handler must be callable");
414 return PyDict_SetItemString(interp
->codec_error_registry
,
415 (char *)name
, error
);
418 /* Lookup the error handling callback function registered under the
419 name error. As a special case NULL can be passed, in which case
420 the error handling callback for strict encoding will be returned. */
421 PyObject
*PyCodec_LookupError(const char *name
)
423 PyObject
*handler
= NULL
;
425 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
426 if (interp
->codec_search_path
== NULL
&& _PyCodecRegistry_Init())
431 handler
= PyDict_GetItemString(interp
->codec_error_registry
, (char *)name
);
433 PyErr_Format(PyExc_LookupError
, "unknown error handler name '%.400s'", name
);
439 static void wrong_exception_type(PyObject
*exc
)
441 PyObject
*type
= PyObject_GetAttrString(exc
, "__class__");
443 PyObject
*name
= PyObject_GetAttrString(type
, "__name__");
446 PyObject
*string
= PyObject_Str(name
);
448 if (string
!= NULL
) {
449 PyErr_Format(PyExc_TypeError
,
450 "don't know how to handle %.400s in error callback",
451 PyString_AS_STRING(string
));
458 PyObject
*PyCodec_StrictErrors(PyObject
*exc
)
460 if (PyExceptionInstance_Check(exc
))
461 PyErr_SetObject(PyExceptionInstance_Class(exc
), exc
);
463 PyErr_SetString(PyExc_TypeError
, "codec must pass exception instance");
468 #ifdef Py_USING_UNICODE
469 PyObject
*PyCodec_IgnoreErrors(PyObject
*exc
)
472 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
473 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
476 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
477 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
480 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
481 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
485 wrong_exception_type(exc
);
488 /* ouch: passing NULL, 0, pos gives None instead of u'' */
489 return Py_BuildValue("(u#n)", &end
, 0, end
);
493 PyObject
*PyCodec_ReplaceErrors(PyObject
*exc
)
500 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
503 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
505 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
507 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
510 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
513 restuple
= Py_BuildValue("(On)", res
, end
);
517 else if (PyObject_IsInstance(exc
, PyExc_UnicodeDecodeError
)) {
518 Py_UNICODE res
= Py_UNICODE_REPLACEMENT_CHARACTER
;
519 if (PyUnicodeDecodeError_GetEnd(exc
, &end
))
521 return Py_BuildValue("(u#n)", &res
, 1, end
);
523 else if (PyObject_IsInstance(exc
, PyExc_UnicodeTranslateError
)) {
526 if (PyUnicodeTranslateError_GetStart(exc
, &start
))
528 if (PyUnicodeTranslateError_GetEnd(exc
, &end
))
530 res
= PyUnicode_FromUnicode(NULL
, end
-start
);
533 for (p
= PyUnicode_AS_UNICODE(res
), i
= start
;
535 *p
= Py_UNICODE_REPLACEMENT_CHARACTER
;
536 restuple
= Py_BuildValue("(On)", res
, end
);
541 wrong_exception_type(exc
);
546 PyObject
*PyCodec_XMLCharRefReplaceErrors(PyObject
*exc
)
548 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
558 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
560 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
562 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
564 startp
= PyUnicode_AS_UNICODE(object
);
565 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
574 #ifndef Py_UNICODE_WIDE
586 /* allocate replacement */
587 res
= PyUnicode_FromUnicode(NULL
, ressize
);
592 /* generate replacement */
593 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
594 p
< startp
+end
; ++p
) {
616 #ifndef Py_UNICODE_WIDE
622 else if (*p
<100000) {
626 else if (*p
<1000000) {
636 *outp
++ = '0' + c
/base
;
642 restuple
= Py_BuildValue("(On)", res
, end
);
648 wrong_exception_type(exc
);
653 static Py_UNICODE hexdigits
[] = {
654 '0', '1', '2', '3', '4', '5', '6', '7',
655 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
658 PyObject
*PyCodec_BackslashReplaceErrors(PyObject
*exc
)
660 if (PyObject_IsInstance(exc
, PyExc_UnicodeEncodeError
)) {
670 if (PyUnicodeEncodeError_GetStart(exc
, &start
))
672 if (PyUnicodeEncodeError_GetEnd(exc
, &end
))
674 if (!(object
= PyUnicodeEncodeError_GetObject(exc
)))
676 startp
= PyUnicode_AS_UNICODE(object
);
677 for (p
= startp
+start
, ressize
= 0; p
< startp
+end
; ++p
) {
678 #ifdef Py_UNICODE_WIDE
679 if (*p
>= 0x00010000)
689 res
= PyUnicode_FromUnicode(NULL
, ressize
);
692 for (p
= startp
+start
, outp
= PyUnicode_AS_UNICODE(res
);
693 p
< startp
+end
; ++p
) {
696 #ifdef Py_UNICODE_WIDE
697 if (c
>= 0x00010000) {
699 *outp
++ = hexdigits
[(c
>>28)&0xf];
700 *outp
++ = hexdigits
[(c
>>24)&0xf];
701 *outp
++ = hexdigits
[(c
>>20)&0xf];
702 *outp
++ = hexdigits
[(c
>>16)&0xf];
703 *outp
++ = hexdigits
[(c
>>12)&0xf];
704 *outp
++ = hexdigits
[(c
>>8)&0xf];
710 *outp
++ = hexdigits
[(c
>>12)&0xf];
711 *outp
++ = hexdigits
[(c
>>8)&0xf];
715 *outp
++ = hexdigits
[(c
>>4)&0xf];
716 *outp
++ = hexdigits
[c
&0xf];
719 restuple
= Py_BuildValue("(On)", res
, end
);
725 wrong_exception_type(exc
);
731 static PyObject
*strict_errors(PyObject
*self
, PyObject
*exc
)
733 return PyCodec_StrictErrors(exc
);
737 #ifdef Py_USING_UNICODE
738 static PyObject
*ignore_errors(PyObject
*self
, PyObject
*exc
)
740 return PyCodec_IgnoreErrors(exc
);
744 static PyObject
*replace_errors(PyObject
*self
, PyObject
*exc
)
746 return PyCodec_ReplaceErrors(exc
);
750 static PyObject
*xmlcharrefreplace_errors(PyObject
*self
, PyObject
*exc
)
752 return PyCodec_XMLCharRefReplaceErrors(exc
);
756 static PyObject
*backslashreplace_errors(PyObject
*self
, PyObject
*exc
)
758 return PyCodec_BackslashReplaceErrors(exc
);
762 static int _PyCodecRegistry_Init(void)
777 #ifdef Py_USING_UNICODE
797 "xmlcharrefreplace_errors",
798 xmlcharrefreplace_errors
,
805 "backslashreplace_errors",
806 backslashreplace_errors
,
813 PyInterpreterState
*interp
= PyThreadState_GET()->interp
;
817 if (interp
->codec_search_path
!= NULL
)
820 interp
->codec_search_path
= PyList_New(0);
821 interp
->codec_search_cache
= PyDict_New();
822 interp
->codec_error_registry
= PyDict_New();
824 if (interp
->codec_error_registry
) {
825 for (i
= 0; i
< sizeof(methods
)/sizeof(methods
[0]); ++i
) {
826 PyObject
*func
= PyCFunction_New(&methods
[i
].def
, NULL
);
829 Py_FatalError("can't initialize codec error registry");
830 res
= PyCodec_RegisterError(methods
[i
].name
, func
);
833 Py_FatalError("can't initialize codec error registry");
837 if (interp
->codec_search_path
== NULL
||
838 interp
->codec_search_cache
== NULL
||
839 interp
->codec_error_registry
== NULL
)
840 Py_FatalError("can't initialize codec registry");
842 mod
= PyImport_ImportModuleLevel("encodings", NULL
, NULL
, NULL
, 0);
844 if (PyErr_ExceptionMatches(PyExc_ImportError
)) {
845 /* Ignore ImportErrors... this is done so that
846 distributions can disable the encodings package. Note
847 that other errors are not masked, e.g. SystemErrors
848 raised to inform the user of an error in the Python
849 configuration are still reported back to the user. */