1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, bytes consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode object as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
38 #define PY_SSIZE_T_CLEAN
41 /* --- Registry ----------------------------------------------------------- */
43 PyDoc_STRVAR(register__doc__
,
44 "register(search_function)\n\
46 Register a codec search function. Search functions are expected to take\n\
47 one argument, the encoding name in all lower case letters, and return\n\
48 a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
51 PyObject
*codec_register(PyObject
*self
, PyObject
*search_function
)
53 if (PyCodec_Register(search_function
))
59 PyDoc_STRVAR(lookup__doc__
,
60 "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
62 Looks up a codec tuple in the Python codec registry and returns\n\
63 a tuple of functions.");
66 PyObject
*codec_lookup(PyObject
*self
, PyObject
*args
)
70 if (!PyArg_ParseTuple(args
, "s:lookup", &encoding
))
73 return _PyCodec_Lookup(encoding
);
76 PyDoc_STRVAR(encode__doc__
,
77 "encode(obj, [encoding[,errors]]) -> object\n\
79 Encodes obj using the codec registered for encoding. encoding defaults\n\
80 to the default encoding. errors may be given to set a different error\n\
81 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
82 a ValueError. Other possible values are 'ignore', 'replace' and\n\
83 'xmlcharrefreplace' as well as any other name registered with\n\
84 codecs.register_error that can handle ValueErrors.");
87 codec_encode(PyObject
*self
, PyObject
*args
)
89 const char *encoding
= NULL
;
90 const char *errors
= NULL
;
93 if (!PyArg_ParseTuple(args
, "O|ss:encode", &v
, &encoding
, &errors
))
96 #ifdef Py_USING_UNICODE
98 encoding
= PyUnicode_GetDefaultEncoding();
100 if (encoding
== NULL
) {
101 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
106 /* Encode via the codec registry */
107 return PyCodec_Encode(v
, encoding
, errors
);
110 PyDoc_STRVAR(decode__doc__
,
111 "decode(obj, [encoding[,errors]]) -> object\n\
113 Decodes obj using the codec registered for encoding. encoding defaults\n\
114 to the default encoding. errors may be given to set a different error\n\
115 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
116 a ValueError. Other possible values are 'ignore' and 'replace'\n\
117 as well as any other name registerd with codecs.register_error that is\n\
118 able to handle ValueErrors.");
121 codec_decode(PyObject
*self
, PyObject
*args
)
123 const char *encoding
= NULL
;
124 const char *errors
= NULL
;
127 if (!PyArg_ParseTuple(args
, "O|ss:decode", &v
, &encoding
, &errors
))
130 #ifdef Py_USING_UNICODE
131 if (encoding
== NULL
)
132 encoding
= PyUnicode_GetDefaultEncoding();
134 if (encoding
== NULL
) {
135 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
140 /* Decode via the codec registry */
141 return PyCodec_Decode(v
, encoding
, errors
);
144 /* --- Helpers ------------------------------------------------------------ */
147 PyObject
*codec_tuple(PyObject
*unicode
,
153 v
= Py_BuildValue("On", unicode
, len
);
158 /* --- String codecs ------------------------------------------------------ */
160 escape_decode(PyObject
*self
,
163 const char *errors
= NULL
;
167 if (!PyArg_ParseTuple(args
, "s#|z:escape_decode",
168 &data
, &size
, &errors
))
170 return codec_tuple(PyString_DecodeEscape(data
, size
, errors
, 0, NULL
),
175 escape_encode(PyObject
*self
,
179 const char *errors
= NULL
;
183 if (!PyArg_ParseTuple(args
, "O!|z:escape_encode",
184 &PyString_Type
, &str
, &errors
))
187 str
= PyString_Repr(str
, 0);
191 /* The string will be quoted. Unquote, similar to unicode-escape. */
192 buf
= PyString_AS_STRING (str
);
193 len
= PyString_GET_SIZE (str
);
194 memmove(buf
, buf
+1, len
-2);
195 _PyString_Resize(&str
, len
-2);
197 return codec_tuple(str
, PyString_Size(str
));
200 #ifdef Py_USING_UNICODE
201 /* --- Decoder ------------------------------------------------------------ */
204 unicode_internal_decode(PyObject
*self
,
208 const char *errors
= NULL
;
212 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_decode",
216 if (PyUnicode_Check(obj
)) {
218 return codec_tuple(obj
, PyUnicode_GET_SIZE(obj
));
221 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
224 return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data
, size
, errors
),
230 utf_7_decode(PyObject
*self
,
235 const char *errors
= NULL
;
237 if (!PyArg_ParseTuple(args
, "t#|z:utf_7_decode",
238 &data
, &size
, &errors
))
241 return codec_tuple(PyUnicode_DecodeUTF7(data
, size
, errors
),
246 utf_8_decode(PyObject
*self
,
251 const char *errors
= NULL
;
254 PyObject
*decoded
= NULL
;
256 if (!PyArg_ParseTuple(args
, "t#|zi:utf_8_decode",
257 &data
, &size
, &errors
, &final
))
260 PyErr_SetString(PyExc_ValueError
, "negative argument");
265 decoded
= PyUnicode_DecodeUTF8Stateful(data
, size
, errors
,
266 final
? NULL
: &consumed
);
269 return codec_tuple(decoded
, consumed
);
273 utf_16_decode(PyObject
*self
,
278 const char *errors
= NULL
;
284 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_decode",
285 &data
, &size
, &errors
, &final
))
288 PyErr_SetString(PyExc_ValueError
, "negative argument");
291 consumed
= size
; /* This is overwritten unless final is true. */
292 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
293 final
? NULL
: &consumed
);
296 return codec_tuple(decoded
, consumed
);
300 utf_16_le_decode(PyObject
*self
,
305 const char *errors
= NULL
;
309 PyObject
*decoded
= NULL
;
311 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_le_decode",
312 &data
, &size
, &errors
, &final
))
316 PyErr_SetString(PyExc_ValueError
, "negative argument");
319 consumed
= size
; /* This is overwritten unless final is true. */
320 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
321 &byteorder
, final
? NULL
: &consumed
);
324 return codec_tuple(decoded
, consumed
);
329 utf_16_be_decode(PyObject
*self
,
334 const char *errors
= NULL
;
338 PyObject
*decoded
= NULL
;
340 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_be_decode",
341 &data
, &size
, &errors
, &final
))
344 PyErr_SetString(PyExc_ValueError
, "negative argument");
347 consumed
= size
; /* This is overwritten unless final is true. */
348 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
349 &byteorder
, final
? NULL
: &consumed
);
352 return codec_tuple(decoded
, consumed
);
355 /* This non-standard version also provides access to the byteorder
356 parameter of the builtin UTF-16 codec.
358 It returns a tuple (unicode, bytesread, byteorder) with byteorder
359 being the value in effect at the end of data.
364 utf_16_ex_decode(PyObject
*self
,
369 const char *errors
= NULL
;
371 PyObject
*unicode
, *tuple
;
375 if (!PyArg_ParseTuple(args
, "t#|zii:utf_16_ex_decode",
376 &data
, &size
, &errors
, &byteorder
, &final
))
379 PyErr_SetString(PyExc_ValueError
, "negative argument");
382 consumed
= size
; /* This is overwritten unless final is true. */
383 unicode
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
384 final
? NULL
: &consumed
);
387 tuple
= Py_BuildValue("Oni", unicode
, consumed
, byteorder
);
393 unicode_escape_decode(PyObject
*self
,
398 const char *errors
= NULL
;
400 if (!PyArg_ParseTuple(args
, "t#|z:unicode_escape_decode",
401 &data
, &size
, &errors
))
404 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data
, size
, errors
),
409 raw_unicode_escape_decode(PyObject
*self
,
414 const char *errors
= NULL
;
416 if (!PyArg_ParseTuple(args
, "t#|z:raw_unicode_escape_decode",
417 &data
, &size
, &errors
))
420 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data
, size
, errors
),
425 latin_1_decode(PyObject
*self
,
430 const char *errors
= NULL
;
432 if (!PyArg_ParseTuple(args
, "t#|z:latin_1_decode",
433 &data
, &size
, &errors
))
436 return codec_tuple(PyUnicode_DecodeLatin1(data
, size
, errors
),
441 ascii_decode(PyObject
*self
,
446 const char *errors
= NULL
;
448 if (!PyArg_ParseTuple(args
, "t#|z:ascii_decode",
449 &data
, &size
, &errors
))
452 return codec_tuple(PyUnicode_DecodeASCII(data
, size
, errors
),
457 charmap_decode(PyObject
*self
,
462 const char *errors
= NULL
;
463 PyObject
*mapping
= NULL
;
465 if (!PyArg_ParseTuple(args
, "t#|zO:charmap_decode",
466 &data
, &size
, &errors
, &mapping
))
468 if (mapping
== Py_None
)
471 return codec_tuple(PyUnicode_DecodeCharmap(data
, size
, mapping
, errors
),
475 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
478 mbcs_decode(PyObject
*self
,
482 Py_ssize_t size
, consumed
;
483 const char *errors
= NULL
;
487 if (!PyArg_ParseTuple(args
, "t#|zi:mbcs_decode",
488 &data
, &size
, &errors
, &final
))
491 decoded
= PyUnicode_DecodeMBCSStateful(
492 data
, size
, errors
, final
? NULL
: &consumed
);
495 return codec_tuple(decoded
, final
? size
: consumed
);
498 #endif /* MS_WINDOWS */
500 /* --- Encoder ------------------------------------------------------------ */
503 readbuffer_encode(PyObject
*self
,
508 const char *errors
= NULL
;
510 if (!PyArg_ParseTuple(args
, "s#|z:readbuffer_encode",
511 &data
, &size
, &errors
))
514 return codec_tuple(PyString_FromStringAndSize(data
, size
),
519 charbuffer_encode(PyObject
*self
,
524 const char *errors
= NULL
;
526 if (!PyArg_ParseTuple(args
, "t#|z:charbuffer_encode",
527 &data
, &size
, &errors
))
530 return codec_tuple(PyString_FromStringAndSize(data
, size
),
535 unicode_internal_encode(PyObject
*self
,
539 const char *errors
= NULL
;
543 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_encode",
547 if (PyUnicode_Check(obj
)) {
548 data
= PyUnicode_AS_DATA(obj
);
549 size
= PyUnicode_GET_DATA_SIZE(obj
);
550 return codec_tuple(PyString_FromStringAndSize(data
, size
),
554 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
556 return codec_tuple(PyString_FromStringAndSize(data
, size
),
562 utf_7_encode(PyObject
*self
,
566 const char *errors
= NULL
;
568 if (!PyArg_ParseTuple(args
, "O|z:utf_7_encode",
572 str
= PyUnicode_FromObject(str
);
575 v
= codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str
),
576 PyUnicode_GET_SIZE(str
),
580 PyUnicode_GET_SIZE(str
));
586 utf_8_encode(PyObject
*self
,
590 const char *errors
= NULL
;
592 if (!PyArg_ParseTuple(args
, "O|z:utf_8_encode",
596 str
= PyUnicode_FromObject(str
);
599 v
= codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str
),
600 PyUnicode_GET_SIZE(str
),
602 PyUnicode_GET_SIZE(str
));
607 /* This version provides access to the byteorder parameter of the
608 builtin UTF-16 codecs as optional third argument. It defaults to 0
609 which means: use the native byte order and prepend the data with a
615 utf_16_encode(PyObject
*self
,
619 const char *errors
= NULL
;
622 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_encode",
623 &str
, &errors
, &byteorder
))
626 str
= PyUnicode_FromObject(str
);
629 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
630 PyUnicode_GET_SIZE(str
),
633 PyUnicode_GET_SIZE(str
));
639 utf_16_le_encode(PyObject
*self
,
643 const char *errors
= NULL
;
645 if (!PyArg_ParseTuple(args
, "O|z:utf_16_le_encode",
649 str
= PyUnicode_FromObject(str
);
652 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
653 PyUnicode_GET_SIZE(str
),
656 PyUnicode_GET_SIZE(str
));
662 utf_16_be_encode(PyObject
*self
,
666 const char *errors
= NULL
;
668 if (!PyArg_ParseTuple(args
, "O|z:utf_16_be_encode",
672 str
= PyUnicode_FromObject(str
);
675 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
676 PyUnicode_GET_SIZE(str
),
679 PyUnicode_GET_SIZE(str
));
685 unicode_escape_encode(PyObject
*self
,
689 const char *errors
= NULL
;
691 if (!PyArg_ParseTuple(args
, "O|z:unicode_escape_encode",
695 str
= PyUnicode_FromObject(str
);
698 v
= codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str
),
699 PyUnicode_GET_SIZE(str
)),
700 PyUnicode_GET_SIZE(str
));
706 raw_unicode_escape_encode(PyObject
*self
,
710 const char *errors
= NULL
;
712 if (!PyArg_ParseTuple(args
, "O|z:raw_unicode_escape_encode",
716 str
= PyUnicode_FromObject(str
);
719 v
= codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
720 PyUnicode_AS_UNICODE(str
),
721 PyUnicode_GET_SIZE(str
)),
722 PyUnicode_GET_SIZE(str
));
728 latin_1_encode(PyObject
*self
,
732 const char *errors
= NULL
;
734 if (!PyArg_ParseTuple(args
, "O|z:latin_1_encode",
738 str
= PyUnicode_FromObject(str
);
741 v
= codec_tuple(PyUnicode_EncodeLatin1(
742 PyUnicode_AS_UNICODE(str
),
743 PyUnicode_GET_SIZE(str
),
745 PyUnicode_GET_SIZE(str
));
751 ascii_encode(PyObject
*self
,
755 const char *errors
= NULL
;
757 if (!PyArg_ParseTuple(args
, "O|z:ascii_encode",
761 str
= PyUnicode_FromObject(str
);
764 v
= codec_tuple(PyUnicode_EncodeASCII(
765 PyUnicode_AS_UNICODE(str
),
766 PyUnicode_GET_SIZE(str
),
768 PyUnicode_GET_SIZE(str
));
774 charmap_encode(PyObject
*self
,
778 const char *errors
= NULL
;
779 PyObject
*mapping
= NULL
;
781 if (!PyArg_ParseTuple(args
, "O|zO:charmap_encode",
782 &str
, &errors
, &mapping
))
784 if (mapping
== Py_None
)
787 str
= PyUnicode_FromObject(str
);
790 v
= codec_tuple(PyUnicode_EncodeCharmap(
791 PyUnicode_AS_UNICODE(str
),
792 PyUnicode_GET_SIZE(str
),
795 PyUnicode_GET_SIZE(str
));
801 charmap_build(PyObject
*self
, PyObject
*args
)
804 if (!PyArg_ParseTuple(args
, "U:charmap_build", &map
))
806 return PyUnicode_BuildEncodingMap(map
);
809 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
812 mbcs_encode(PyObject
*self
,
816 const char *errors
= NULL
;
818 if (!PyArg_ParseTuple(args
, "O|z:mbcs_encode",
822 str
= PyUnicode_FromObject(str
);
825 v
= codec_tuple(PyUnicode_EncodeMBCS(
826 PyUnicode_AS_UNICODE(str
),
827 PyUnicode_GET_SIZE(str
),
829 PyUnicode_GET_SIZE(str
));
834 #endif /* MS_WINDOWS */
835 #endif /* Py_USING_UNICODE */
837 /* --- Error handler registry --------------------------------------------- */
839 PyDoc_STRVAR(register_error__doc__
,
840 "register_error(errors, handler)\n\
842 Register the specified error handler under the name\n\
843 errors. handler must be a callable object, that\n\
844 will be called with an exception instance containing\n\
845 information about the location of the encoding/decoding\n\
846 error and must return a (replacement, new position) tuple.");
848 static PyObject
*register_error(PyObject
*self
, PyObject
*args
)
853 if (!PyArg_ParseTuple(args
, "sO:register_error",
856 if (PyCodec_RegisterError(name
, handler
))
861 PyDoc_STRVAR(lookup_error__doc__
,
862 "lookup_error(errors) -> handler\n\
864 Return the error handler for the specified error handling name\n\
865 or raise a LookupError, if no handler exists under this name.");
867 static PyObject
*lookup_error(PyObject
*self
, PyObject
*args
)
871 if (!PyArg_ParseTuple(args
, "s:lookup_error",
874 return PyCodec_LookupError(name
);
877 /* --- Module API --------------------------------------------------------- */
879 static PyMethodDef _codecs_functions
[] = {
880 {"register", codec_register
, METH_O
,
882 {"lookup", codec_lookup
, METH_VARARGS
,
884 {"encode", codec_encode
, METH_VARARGS
,
886 {"decode", codec_decode
, METH_VARARGS
,
888 {"escape_encode", escape_encode
, METH_VARARGS
},
889 {"escape_decode", escape_decode
, METH_VARARGS
},
890 #ifdef Py_USING_UNICODE
891 {"utf_8_encode", utf_8_encode
, METH_VARARGS
},
892 {"utf_8_decode", utf_8_decode
, METH_VARARGS
},
893 {"utf_7_encode", utf_7_encode
, METH_VARARGS
},
894 {"utf_7_decode", utf_7_decode
, METH_VARARGS
},
895 {"utf_16_encode", utf_16_encode
, METH_VARARGS
},
896 {"utf_16_le_encode", utf_16_le_encode
, METH_VARARGS
},
897 {"utf_16_be_encode", utf_16_be_encode
, METH_VARARGS
},
898 {"utf_16_decode", utf_16_decode
, METH_VARARGS
},
899 {"utf_16_le_decode", utf_16_le_decode
, METH_VARARGS
},
900 {"utf_16_be_decode", utf_16_be_decode
, METH_VARARGS
},
901 {"utf_16_ex_decode", utf_16_ex_decode
, METH_VARARGS
},
902 {"unicode_escape_encode", unicode_escape_encode
, METH_VARARGS
},
903 {"unicode_escape_decode", unicode_escape_decode
, METH_VARARGS
},
904 {"unicode_internal_encode", unicode_internal_encode
, METH_VARARGS
},
905 {"unicode_internal_decode", unicode_internal_decode
, METH_VARARGS
},
906 {"raw_unicode_escape_encode", raw_unicode_escape_encode
, METH_VARARGS
},
907 {"raw_unicode_escape_decode", raw_unicode_escape_decode
, METH_VARARGS
},
908 {"latin_1_encode", latin_1_encode
, METH_VARARGS
},
909 {"latin_1_decode", latin_1_decode
, METH_VARARGS
},
910 {"ascii_encode", ascii_encode
, METH_VARARGS
},
911 {"ascii_decode", ascii_decode
, METH_VARARGS
},
912 {"charmap_encode", charmap_encode
, METH_VARARGS
},
913 {"charmap_decode", charmap_decode
, METH_VARARGS
},
914 {"charmap_build", charmap_build
, METH_VARARGS
},
915 {"readbuffer_encode", readbuffer_encode
, METH_VARARGS
},
916 {"charbuffer_encode", charbuffer_encode
, METH_VARARGS
},
917 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
918 {"mbcs_encode", mbcs_encode
, METH_VARARGS
},
919 {"mbcs_decode", mbcs_decode
, METH_VARARGS
},
921 #endif /* Py_USING_UNICODE */
922 {"register_error", register_error
, METH_VARARGS
,
923 register_error__doc__
},
924 {"lookup_error", lookup_error
, METH_VARARGS
,
925 lookup_error__doc__
},
926 {NULL
, NULL
} /* sentinel */
932 Py_InitModule("_codecs", _codecs_functions
);