Added a test for the ability to specify a class attribute in Formatter configuration...
[python.git] / Python / codecs.c
blob5c521fb0b4ee7522c12a1c16998fba16e1a262c9
1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
11 #include "Python.h"
12 #include <ctype.h>
14 /* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package, which registers the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one
   attempt is made.
*/
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject *search_function)
31 PyInterpreterState *interp = PyThreadState_GET()->interp;
32 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
34 if (search_function == NULL) {
35 PyErr_BadArgument();
36 goto onError;
38 if (!PyCallable_Check(search_function)) {
39 PyErr_SetString(PyExc_TypeError, "argument must be callable");
40 goto onError;
42 return PyList_Append(interp->codec_search_path, search_function);
44 onError:
45 return -1;
48 /* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
51 static
52 PyObject *normalizestring(const char *string)
54 register size_t i;
55 size_t len = strlen(string);
56 char *p;
57 PyObject *v;
59 if (len > INT_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
64 v = PyString_FromStringAndSize(NULL, (int)len);
65 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
68 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
73 ch = tolower(ch);
74 p[i] = ch;
76 return v;
79 /* Lookup the given encoding and return a tuple providing the codec
80 facilities.
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
86 If no codec is found, a LookupError is set and NULL returned.
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
94 PyObject *_PyCodec_Lookup(const char *encoding)
96 PyInterpreterState *interp;
97 PyObject *result, *args = NULL, *v;
98 int i, len;
100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
105 interp = PyThreadState_GET()->interp;
106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
107 goto onError;
109 /* Convert the encoding to a normalized Python string: all
110 characters are converted to lower case, spaces and hyphens are
111 replaced with underscores. */
112 v = normalizestring(encoding);
113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
117 /* First, try to lookup the name in the registry dictionary */
118 result = PyDict_GetItem(interp->codec_search_cache, v);
119 if (result != NULL) {
120 Py_INCREF(result);
121 Py_DECREF(v);
122 return result;
125 /* Next, scan the search functions in order of registration */
126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
131 len = PyList_Size(interp->codec_search_path);
132 if (len < 0)
133 goto onError;
134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
141 for (i = 0; i < len; i++) {
142 PyObject *func;
144 func = PyList_GetItem(interp->codec_search_path, i);
145 if (func == NULL)
146 goto onError;
147 result = PyEval_CallObject(func, args);
148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
160 break;
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
166 goto onError;
169 /* Cache and return the result */
170 PyDict_SetItem(interp->codec_search_cache, v, result);
171 Py_DECREF(args);
172 return result;
174 onError:
175 Py_XDECREF(args);
176 return NULL;
179 static
180 PyObject *args_tuple(PyObject *object,
181 const char *errors)
183 PyObject *args;
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
198 PyTuple_SET_ITEM(args, 1, v);
200 return args;
203 /* Build a codec by calling factory(stream[,errors]) or just
204 factory(errors) depending on whether the given parameters are
205 non-NULL. */
207 static
208 PyObject *build_stream_codec(PyObject *factory,
209 PyObject *stream,
210 const char *errors)
212 PyObject *args, *codec;
214 args = args_tuple(stream, errors);
215 if (args == NULL)
216 return NULL;
218 codec = PyEval_CallObject(factory, args);
219 Py_DECREF(args);
220 return codec;
/* Convenience APIs to query the codec registry.

   All APIs return a codec object with an incremented reference count.
*/
229 PyObject *PyCodec_Encoder(const char *encoding)
231 PyObject *codecs;
232 PyObject *v;
234 codecs = _PyCodec_Lookup(encoding);
235 if (codecs == NULL)
236 goto onError;
237 v = PyTuple_GET_ITEM(codecs,0);
238 Py_DECREF(codecs);
239 Py_INCREF(v);
240 return v;
242 onError:
243 return NULL;
246 PyObject *PyCodec_Decoder(const char *encoding)
248 PyObject *codecs;
249 PyObject *v;
251 codecs = _PyCodec_Lookup(encoding);
252 if (codecs == NULL)
253 goto onError;
254 v = PyTuple_GET_ITEM(codecs,1);
255 Py_DECREF(codecs);
256 Py_INCREF(v);
257 return v;
259 onError:
260 return NULL;
263 PyObject *PyCodec_StreamReader(const char *encoding,
264 PyObject *stream,
265 const char *errors)
267 PyObject *codecs, *ret;
269 codecs = _PyCodec_Lookup(encoding);
270 if (codecs == NULL)
271 goto onError;
272 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
273 Py_DECREF(codecs);
274 return ret;
276 onError:
277 return NULL;
280 PyObject *PyCodec_StreamWriter(const char *encoding,
281 PyObject *stream,
282 const char *errors)
284 PyObject *codecs, *ret;
286 codecs = _PyCodec_Lookup(encoding);
287 if (codecs == NULL)
288 goto onError;
289 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
290 Py_DECREF(codecs);
291 return ret;
293 onError:
294 return NULL;
297 /* Encode an object (e.g. an Unicode object) using the given encoding
298 and return the resulting encoded object (usually a Python string).
300 errors is passed to the encoder factory as argument if non-NULL. */
302 PyObject *PyCodec_Encode(PyObject *object,
303 const char *encoding,
304 const char *errors)
306 PyObject *encoder = NULL;
307 PyObject *args = NULL, *result = NULL;
308 PyObject *v;
310 encoder = PyCodec_Encoder(encoding);
311 if (encoder == NULL)
312 goto onError;
314 args = args_tuple(object, errors);
315 if (args == NULL)
316 goto onError;
318 result = PyEval_CallObject(encoder,args);
319 if (result == NULL)
320 goto onError;
322 if (!PyTuple_Check(result) ||
323 PyTuple_GET_SIZE(result) != 2) {
324 PyErr_SetString(PyExc_TypeError,
325 "encoder must return a tuple (object,integer)");
326 goto onError;
328 v = PyTuple_GET_ITEM(result,0);
329 Py_INCREF(v);
330 /* We don't check or use the second (integer) entry. */
332 Py_DECREF(args);
333 Py_DECREF(encoder);
334 Py_DECREF(result);
335 return v;
337 onError:
338 Py_XDECREF(result);
339 Py_XDECREF(args);
340 Py_XDECREF(encoder);
341 return NULL;
344 /* Decode an object (usually a Python string) using the given encoding
345 and return an equivalent object (e.g. an Unicode object).
347 errors is passed to the decoder factory as argument if non-NULL. */
349 PyObject *PyCodec_Decode(PyObject *object,
350 const char *encoding,
351 const char *errors)
353 PyObject *decoder = NULL;
354 PyObject *args = NULL, *result = NULL;
355 PyObject *v;
357 decoder = PyCodec_Decoder(encoding);
358 if (decoder == NULL)
359 goto onError;
361 args = args_tuple(object, errors);
362 if (args == NULL)
363 goto onError;
365 result = PyEval_CallObject(decoder,args);
366 if (result == NULL)
367 goto onError;
368 if (!PyTuple_Check(result) ||
369 PyTuple_GET_SIZE(result) != 2) {
370 PyErr_SetString(PyExc_TypeError,
371 "decoder must return a tuple (object,integer)");
372 goto onError;
374 v = PyTuple_GET_ITEM(result,0);
375 Py_INCREF(v);
376 /* We don't check or use the second (integer) entry. */
378 Py_DECREF(args);
379 Py_DECREF(decoder);
380 Py_DECREF(result);
381 return v;
383 onError:
384 Py_XDECREF(args);
385 Py_XDECREF(decoder);
386 Py_XDECREF(result);
387 return NULL;
390 /* Register the error handling callback function error under the name
391 name. This function will be called by the codec when it encounters
392 an unencodable characters/undecodable bytes and doesn't know the
393 callback name, when name is specified as the error parameter
394 in the call to the encode/decode function.
395 Return 0 on success, -1 on error */
396 int PyCodec_RegisterError(const char *name, PyObject *error)
398 PyInterpreterState *interp = PyThreadState_GET()->interp;
399 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
400 return -1;
401 if (!PyCallable_Check(error)) {
402 PyErr_SetString(PyExc_TypeError, "handler must be callable");
403 return -1;
405 return PyDict_SetItemString(interp->codec_error_registry,
406 (char *)name, error);
409 /* Lookup the error handling callback function registered under the
410 name error. As a special case NULL can be passed, in which case
411 the error handling callback for strict encoding will be returned. */
412 PyObject *PyCodec_LookupError(const char *name)
414 PyObject *handler = NULL;
416 PyInterpreterState *interp = PyThreadState_GET()->interp;
417 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
418 return NULL;
420 if (name==NULL)
421 name = "strict";
422 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
423 if (!handler)
424 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
425 else
426 Py_INCREF(handler);
427 return handler;
430 static void wrong_exception_type(PyObject *exc)
432 PyObject *type = PyObject_GetAttrString(exc, "__class__");
433 if (type != NULL) {
434 PyObject *name = PyObject_GetAttrString(type, "__name__");
435 Py_DECREF(type);
436 if (name != NULL) {
437 PyObject *string = PyObject_Str(name);
438 Py_DECREF(name);
439 if (string != NULL) {
440 PyErr_Format(PyExc_TypeError,
441 "don't know how to handle %.400s in error callback",
442 PyString_AS_STRING(string));
443 Py_DECREF(string);
449 PyObject *PyCodec_StrictErrors(PyObject *exc)
451 if (PyInstance_Check(exc))
452 PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class,
453 exc);
454 else
455 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
456 return NULL;
460 #ifdef Py_USING_UNICODE
461 PyObject *PyCodec_IgnoreErrors(PyObject *exc)
463 int end;
464 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
465 if (PyUnicodeEncodeError_GetEnd(exc, &end))
466 return NULL;
468 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
469 if (PyUnicodeDecodeError_GetEnd(exc, &end))
470 return NULL;
472 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
473 if (PyUnicodeTranslateError_GetEnd(exc, &end))
474 return NULL;
476 else {
477 wrong_exception_type(exc);
478 return NULL;
480 /* ouch: passing NULL, 0, pos gives None instead of u'' */
481 return Py_BuildValue("(u#i)", &end, 0, end);
485 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
487 PyObject *restuple;
488 int start;
489 int end;
490 int i;
492 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
493 PyObject *res;
494 Py_UNICODE *p;
495 if (PyUnicodeEncodeError_GetStart(exc, &start))
496 return NULL;
497 if (PyUnicodeEncodeError_GetEnd(exc, &end))
498 return NULL;
499 res = PyUnicode_FromUnicode(NULL, end-start);
500 if (res == NULL)
501 return NULL;
502 for (p = PyUnicode_AS_UNICODE(res), i = start;
503 i<end; ++p, ++i)
504 *p = '?';
505 restuple = Py_BuildValue("(Oi)", res, end);
506 Py_DECREF(res);
507 return restuple;
509 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
510 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
511 if (PyUnicodeDecodeError_GetEnd(exc, &end))
512 return NULL;
513 return Py_BuildValue("(u#i)", &res, 1, end);
515 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
516 PyObject *res;
517 Py_UNICODE *p;
518 if (PyUnicodeTranslateError_GetStart(exc, &start))
519 return NULL;
520 if (PyUnicodeTranslateError_GetEnd(exc, &end))
521 return NULL;
522 res = PyUnicode_FromUnicode(NULL, end-start);
523 if (res == NULL)
524 return NULL;
525 for (p = PyUnicode_AS_UNICODE(res), i = start;
526 i<end; ++p, ++i)
527 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
528 restuple = Py_BuildValue("(Oi)", res, end);
529 Py_DECREF(res);
530 return restuple;
532 else {
533 wrong_exception_type(exc);
534 return NULL;
538 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
540 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
541 PyObject *restuple;
542 PyObject *object;
543 int start;
544 int end;
545 PyObject *res;
546 Py_UNICODE *p;
547 Py_UNICODE *startp;
548 Py_UNICODE *outp;
549 int ressize;
550 if (PyUnicodeEncodeError_GetStart(exc, &start))
551 return NULL;
552 if (PyUnicodeEncodeError_GetEnd(exc, &end))
553 return NULL;
554 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
555 return NULL;
556 startp = PyUnicode_AS_UNICODE(object);
557 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
558 if (*p<10)
559 ressize += 2+1+1;
560 else if (*p<100)
561 ressize += 2+2+1;
562 else if (*p<1000)
563 ressize += 2+3+1;
564 else if (*p<10000)
565 ressize += 2+4+1;
566 #ifndef Py_UNICODE_WIDE
567 else
568 ressize += 2+5+1;
569 #else
570 else if (*p<100000)
571 ressize += 2+5+1;
572 else if (*p<1000000)
573 ressize += 2+6+1;
574 else
575 ressize += 2+7+1;
576 #endif
578 /* allocate replacement */
579 res = PyUnicode_FromUnicode(NULL, ressize);
580 if (res == NULL) {
581 Py_DECREF(object);
582 return NULL;
584 /* generate replacement */
585 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
586 p < startp+end; ++p) {
587 Py_UNICODE c = *p;
588 int digits;
589 int base;
590 *outp++ = '&';
591 *outp++ = '#';
592 if (*p<10) {
593 digits = 1;
594 base = 1;
596 else if (*p<100) {
597 digits = 2;
598 base = 10;
600 else if (*p<1000) {
601 digits = 3;
602 base = 100;
604 else if (*p<10000) {
605 digits = 4;
606 base = 1000;
608 #ifndef Py_UNICODE_WIDE
609 else {
610 digits = 5;
611 base = 10000;
613 #else
614 else if (*p<100000) {
615 digits = 5;
616 base = 10000;
618 else if (*p<1000000) {
619 digits = 6;
620 base = 100000;
622 else {
623 digits = 7;
624 base = 1000000;
626 #endif
627 while (digits-->0) {
628 *outp++ = '0' + c/base;
629 c %= base;
630 base /= 10;
632 *outp++ = ';';
634 restuple = Py_BuildValue("(Oi)", res, end);
635 Py_DECREF(res);
636 Py_DECREF(object);
637 return restuple;
639 else {
640 wrong_exception_type(exc);
641 return NULL;
645 static Py_UNICODE hexdigits[] = {
646 '0', '1', '2', '3', '4', '5', '6', '7',
647 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
650 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
652 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
653 PyObject *restuple;
654 PyObject *object;
655 int start;
656 int end;
657 PyObject *res;
658 Py_UNICODE *p;
659 Py_UNICODE *startp;
660 Py_UNICODE *outp;
661 int ressize;
662 if (PyUnicodeEncodeError_GetStart(exc, &start))
663 return NULL;
664 if (PyUnicodeEncodeError_GetEnd(exc, &end))
665 return NULL;
666 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
667 return NULL;
668 startp = PyUnicode_AS_UNICODE(object);
669 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
670 #ifdef Py_UNICODE_WIDE
671 if (*p >= 0x00010000)
672 ressize += 1+1+8;
673 else
674 #endif
675 if (*p >= 0x100) {
676 ressize += 1+1+4;
678 else
679 ressize += 1+1+2;
681 res = PyUnicode_FromUnicode(NULL, ressize);
682 if (res==NULL)
683 return NULL;
684 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
685 p < startp+end; ++p) {
686 Py_UNICODE c = *p;
687 *outp++ = '\\';
688 #ifdef Py_UNICODE_WIDE
689 if (c >= 0x00010000) {
690 *outp++ = 'U';
691 *outp++ = hexdigits[(c>>28)&0xf];
692 *outp++ = hexdigits[(c>>24)&0xf];
693 *outp++ = hexdigits[(c>>20)&0xf];
694 *outp++ = hexdigits[(c>>16)&0xf];
695 *outp++ = hexdigits[(c>>12)&0xf];
696 *outp++ = hexdigits[(c>>8)&0xf];
698 else
699 #endif
700 if (c >= 0x100) {
701 *outp++ = 'u';
702 *outp++ = hexdigits[(c>>12)&0xf];
703 *outp++ = hexdigits[(c>>8)&0xf];
705 else
706 *outp++ = 'x';
707 *outp++ = hexdigits[(c>>4)&0xf];
708 *outp++ = hexdigits[c&0xf];
711 restuple = Py_BuildValue("(Oi)", res, end);
712 Py_DECREF(res);
713 Py_DECREF(object);
714 return restuple;
716 else {
717 wrong_exception_type(exc);
718 return NULL;
721 #endif
723 static PyObject *strict_errors(PyObject *self, PyObject *exc)
725 return PyCodec_StrictErrors(exc);
729 #ifdef Py_USING_UNICODE
730 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
732 return PyCodec_IgnoreErrors(exc);
736 static PyObject *replace_errors(PyObject *self, PyObject *exc)
738 return PyCodec_ReplaceErrors(exc);
742 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
744 return PyCodec_XMLCharRefReplaceErrors(exc);
748 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
750 return PyCodec_BackslashReplaceErrors(exc);
752 #endif
754 static int _PyCodecRegistry_Init(void)
756 static struct {
757 char *name;
758 PyMethodDef def;
759 } methods[] =
762 "strict",
764 "strict_errors",
765 strict_errors,
766 METH_O
769 #ifdef Py_USING_UNICODE
771 "ignore",
773 "ignore_errors",
774 ignore_errors,
775 METH_O
779 "replace",
781 "replace_errors",
782 replace_errors,
783 METH_O
787 "xmlcharrefreplace",
789 "xmlcharrefreplace_errors",
790 xmlcharrefreplace_errors,
791 METH_O
795 "backslashreplace",
797 "backslashreplace_errors",
798 backslashreplace_errors,
799 METH_O
802 #endif
805 PyInterpreterState *interp = PyThreadState_GET()->interp;
806 PyObject *mod;
807 unsigned i;
809 if (interp->codec_search_path != NULL)
810 return 0;
812 interp->codec_search_path = PyList_New(0);
813 interp->codec_search_cache = PyDict_New();
814 interp->codec_error_registry = PyDict_New();
816 if (interp->codec_error_registry) {
817 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
818 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
819 int res;
820 if (!func)
821 Py_FatalError("can't initialize codec error registry");
822 res = PyCodec_RegisterError(methods[i].name, func);
823 Py_DECREF(func);
824 if (res)
825 Py_FatalError("can't initialize codec error registry");
829 if (interp->codec_search_path == NULL ||
830 interp->codec_search_cache == NULL ||
831 interp->codec_error_registry == NULL)
832 Py_FatalError("can't initialize codec registry");
834 mod = PyImport_ImportModuleEx("encodings", NULL, NULL, NULL);
835 if (mod == NULL) {
836 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
837 /* Ignore ImportErrors... this is done so that
838 distributions can disable the encodings package. Note
839 that other errors are not masked, e.g. SystemErrors
840 raised to inform the user of an error in the Python
841 configuration are still reported back to the user. */
842 PyErr_Clear();
843 return 0;
845 return -1;
847 Py_DECREF(mod);
848 return 0;