1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, bytes consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode object as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
38 #define PY_SSIZE_T_CLEAN
41 /* --- Registry ----------------------------------------------------------- */
43 PyDoc_STRVAR(register__doc__
,
44 "register(search_function)\n\
46 Register a codec search function. Search functions are expected to take\n\
47 one argument, the encoding name in all lower case letters, and return\n\
48 a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
51 PyObject
*codec_register(PyObject
*self
, PyObject
*search_function
)
53 if (PyCodec_Register(search_function
))
59 PyDoc_STRVAR(lookup__doc__
,
60 "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
62 Looks up a codec tuple in the Python codec registry and returns\n\
63 a tuple of functions.");
66 PyObject
*codec_lookup(PyObject
*self
, PyObject
*args
)
70 if (!PyArg_ParseTuple(args
, "s:lookup", &encoding
))
73 return _PyCodec_Lookup(encoding
);
76 PyDoc_STRVAR(encode__doc__
,
77 "encode(obj, [encoding[,errors]]) -> object\n\
79 Encodes obj using the codec registered for encoding. encoding defaults\n\
80 to the default encoding. errors may be given to set a different error\n\
81 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
82 a ValueError. Other possible values are 'ignore', 'replace' and\n\
83 'xmlcharrefreplace' as well as any other name registered with\n\
84 codecs.register_error that can handle ValueErrors.");
87 codec_encode(PyObject
*self
, PyObject
*args
)
89 const char *encoding
= NULL
;
90 const char *errors
= NULL
;
93 if (!PyArg_ParseTuple(args
, "O|ss:encode", &v
, &encoding
, &errors
))
96 #ifdef Py_USING_UNICODE
98 encoding
= PyUnicode_GetDefaultEncoding();
100 if (encoding
== NULL
) {
101 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
106 /* Encode via the codec registry */
107 return PyCodec_Encode(v
, encoding
, errors
);
110 PyDoc_STRVAR(decode__doc__
,
111 "decode(obj, [encoding[,errors]]) -> object\n\
113 Decodes obj using the codec registered for encoding. encoding defaults\n\
114 to the default encoding. errors may be given to set a different error\n\
115 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
116 a ValueError. Other possible values are 'ignore' and 'replace'\n\
117 as well as any other name registerd with codecs.register_error that is\n\
118 able to handle ValueErrors.");
121 codec_decode(PyObject
*self
, PyObject
*args
)
123 const char *encoding
= NULL
;
124 const char *errors
= NULL
;
127 if (!PyArg_ParseTuple(args
, "O|ss:decode", &v
, &encoding
, &errors
))
130 #ifdef Py_USING_UNICODE
131 if (encoding
== NULL
)
132 encoding
= PyUnicode_GetDefaultEncoding();
134 if (encoding
== NULL
) {
135 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
140 /* Decode via the codec registry */
141 return PyCodec_Decode(v
, encoding
, errors
);
144 /* --- Helpers ------------------------------------------------------------ */
147 PyObject
*codec_tuple(PyObject
*unicode
,
153 v
= Py_BuildValue("On", unicode
, len
);
158 /* --- String codecs ------------------------------------------------------ */
160 escape_decode(PyObject
*self
,
163 const char *errors
= NULL
;
167 if (!PyArg_ParseTuple(args
, "s#|z:escape_decode",
168 &data
, &size
, &errors
))
170 return codec_tuple(PyString_DecodeEscape(data
, size
, errors
, 0, NULL
),
175 escape_encode(PyObject
*self
,
179 const char *errors
= NULL
;
183 if (!PyArg_ParseTuple(args
, "O!|z:escape_encode",
184 &PyString_Type
, &str
, &errors
))
187 str
= PyString_Repr(str
, 0);
191 /* The string will be quoted. Unquote, similar to unicode-escape. */
192 buf
= PyString_AS_STRING (str
);
193 len
= PyString_GET_SIZE (str
);
194 memmove(buf
, buf
+1, len
-2);
195 if (_PyString_Resize(&str
, len
-2) < 0)
198 return codec_tuple(str
, PyString_Size(str
));
201 #ifdef Py_USING_UNICODE
202 /* --- Decoder ------------------------------------------------------------ */
205 unicode_internal_decode(PyObject
*self
,
209 const char *errors
= NULL
;
213 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_decode",
217 if (PyUnicode_Check(obj
)) {
219 return codec_tuple(obj
, PyUnicode_GET_SIZE(obj
));
222 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
225 return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data
, size
, errors
),
231 utf_7_decode(PyObject
*self
,
236 const char *errors
= NULL
;
238 if (!PyArg_ParseTuple(args
, "t#|z:utf_7_decode",
239 &data
, &size
, &errors
))
242 return codec_tuple(PyUnicode_DecodeUTF7(data
, size
, errors
),
247 utf_8_decode(PyObject
*self
,
252 const char *errors
= NULL
;
255 PyObject
*decoded
= NULL
;
257 if (!PyArg_ParseTuple(args
, "t#|zi:utf_8_decode",
258 &data
, &size
, &errors
, &final
))
261 PyErr_SetString(PyExc_ValueError
, "negative argument");
266 decoded
= PyUnicode_DecodeUTF8Stateful(data
, size
, errors
,
267 final
? NULL
: &consumed
);
270 return codec_tuple(decoded
, consumed
);
274 utf_16_decode(PyObject
*self
,
279 const char *errors
= NULL
;
285 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_decode",
286 &data
, &size
, &errors
, &final
))
289 PyErr_SetString(PyExc_ValueError
, "negative argument");
292 consumed
= size
; /* This is overwritten unless final is true. */
293 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
294 final
? NULL
: &consumed
);
297 return codec_tuple(decoded
, consumed
);
301 utf_16_le_decode(PyObject
*self
,
306 const char *errors
= NULL
;
310 PyObject
*decoded
= NULL
;
312 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_le_decode",
313 &data
, &size
, &errors
, &final
))
317 PyErr_SetString(PyExc_ValueError
, "negative argument");
320 consumed
= size
; /* This is overwritten unless final is true. */
321 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
322 &byteorder
, final
? NULL
: &consumed
);
325 return codec_tuple(decoded
, consumed
);
330 utf_16_be_decode(PyObject
*self
,
335 const char *errors
= NULL
;
339 PyObject
*decoded
= NULL
;
341 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_be_decode",
342 &data
, &size
, &errors
, &final
))
345 PyErr_SetString(PyExc_ValueError
, "negative argument");
348 consumed
= size
; /* This is overwritten unless final is true. */
349 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
350 &byteorder
, final
? NULL
: &consumed
);
353 return codec_tuple(decoded
, consumed
);
356 /* This non-standard version also provides access to the byteorder
357 parameter of the builtin UTF-16 codec.
359 It returns a tuple (unicode, bytesread, byteorder) with byteorder
360 being the value in effect at the end of data.
365 utf_16_ex_decode(PyObject
*self
,
370 const char *errors
= NULL
;
372 PyObject
*unicode
, *tuple
;
376 if (!PyArg_ParseTuple(args
, "t#|zii:utf_16_ex_decode",
377 &data
, &size
, &errors
, &byteorder
, &final
))
380 PyErr_SetString(PyExc_ValueError
, "negative argument");
383 consumed
= size
; /* This is overwritten unless final is true. */
384 unicode
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
385 final
? NULL
: &consumed
);
388 tuple
= Py_BuildValue("Oni", unicode
, consumed
, byteorder
);
394 unicode_escape_decode(PyObject
*self
,
399 const char *errors
= NULL
;
401 if (!PyArg_ParseTuple(args
, "t#|z:unicode_escape_decode",
402 &data
, &size
, &errors
))
405 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data
, size
, errors
),
410 raw_unicode_escape_decode(PyObject
*self
,
415 const char *errors
= NULL
;
417 if (!PyArg_ParseTuple(args
, "t#|z:raw_unicode_escape_decode",
418 &data
, &size
, &errors
))
421 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data
, size
, errors
),
426 latin_1_decode(PyObject
*self
,
431 const char *errors
= NULL
;
433 if (!PyArg_ParseTuple(args
, "t#|z:latin_1_decode",
434 &data
, &size
, &errors
))
437 return codec_tuple(PyUnicode_DecodeLatin1(data
, size
, errors
),
442 ascii_decode(PyObject
*self
,
447 const char *errors
= NULL
;
449 if (!PyArg_ParseTuple(args
, "t#|z:ascii_decode",
450 &data
, &size
, &errors
))
453 return codec_tuple(PyUnicode_DecodeASCII(data
, size
, errors
),
458 charmap_decode(PyObject
*self
,
463 const char *errors
= NULL
;
464 PyObject
*mapping
= NULL
;
466 if (!PyArg_ParseTuple(args
, "t#|zO:charmap_decode",
467 &data
, &size
, &errors
, &mapping
))
469 if (mapping
== Py_None
)
472 return codec_tuple(PyUnicode_DecodeCharmap(data
, size
, mapping
, errors
),
476 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
479 mbcs_decode(PyObject
*self
,
483 Py_ssize_t size
, consumed
;
484 const char *errors
= NULL
;
488 if (!PyArg_ParseTuple(args
, "t#|zi:mbcs_decode",
489 &data
, &size
, &errors
, &final
))
492 decoded
= PyUnicode_DecodeMBCSStateful(
493 data
, size
, errors
, final
? NULL
: &consumed
);
496 return codec_tuple(decoded
, final
? size
: consumed
);
499 #endif /* MS_WINDOWS */
501 /* --- Encoder ------------------------------------------------------------ */
504 readbuffer_encode(PyObject
*self
,
509 const char *errors
= NULL
;
511 if (!PyArg_ParseTuple(args
, "s#|z:readbuffer_encode",
512 &data
, &size
, &errors
))
515 return codec_tuple(PyString_FromStringAndSize(data
, size
),
520 charbuffer_encode(PyObject
*self
,
525 const char *errors
= NULL
;
527 if (!PyArg_ParseTuple(args
, "t#|z:charbuffer_encode",
528 &data
, &size
, &errors
))
531 return codec_tuple(PyString_FromStringAndSize(data
, size
),
536 unicode_internal_encode(PyObject
*self
,
540 const char *errors
= NULL
;
544 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_encode",
548 if (PyUnicode_Check(obj
)) {
549 data
= PyUnicode_AS_DATA(obj
);
550 size
= PyUnicode_GET_DATA_SIZE(obj
);
551 return codec_tuple(PyString_FromStringAndSize(data
, size
),
555 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
557 return codec_tuple(PyString_FromStringAndSize(data
, size
),
563 utf_7_encode(PyObject
*self
,
567 const char *errors
= NULL
;
569 if (!PyArg_ParseTuple(args
, "O|z:utf_7_encode",
573 str
= PyUnicode_FromObject(str
);
576 v
= codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str
),
577 PyUnicode_GET_SIZE(str
),
581 PyUnicode_GET_SIZE(str
));
587 utf_8_encode(PyObject
*self
,
591 const char *errors
= NULL
;
593 if (!PyArg_ParseTuple(args
, "O|z:utf_8_encode",
597 str
= PyUnicode_FromObject(str
);
600 v
= codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str
),
601 PyUnicode_GET_SIZE(str
),
603 PyUnicode_GET_SIZE(str
));
608 /* This version provides access to the byteorder parameter of the
609 builtin UTF-16 codecs as optional third argument. It defaults to 0
610 which means: use the native byte order and prepend the data with a
616 utf_16_encode(PyObject
*self
,
620 const char *errors
= NULL
;
623 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_encode",
624 &str
, &errors
, &byteorder
))
627 str
= PyUnicode_FromObject(str
);
630 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
631 PyUnicode_GET_SIZE(str
),
634 PyUnicode_GET_SIZE(str
));
640 utf_16_le_encode(PyObject
*self
,
644 const char *errors
= NULL
;
646 if (!PyArg_ParseTuple(args
, "O|z:utf_16_le_encode",
650 str
= PyUnicode_FromObject(str
);
653 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
654 PyUnicode_GET_SIZE(str
),
657 PyUnicode_GET_SIZE(str
));
663 utf_16_be_encode(PyObject
*self
,
667 const char *errors
= NULL
;
669 if (!PyArg_ParseTuple(args
, "O|z:utf_16_be_encode",
673 str
= PyUnicode_FromObject(str
);
676 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
677 PyUnicode_GET_SIZE(str
),
680 PyUnicode_GET_SIZE(str
));
686 unicode_escape_encode(PyObject
*self
,
690 const char *errors
= NULL
;
692 if (!PyArg_ParseTuple(args
, "O|z:unicode_escape_encode",
696 str
= PyUnicode_FromObject(str
);
699 v
= codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str
),
700 PyUnicode_GET_SIZE(str
)),
701 PyUnicode_GET_SIZE(str
));
707 raw_unicode_escape_encode(PyObject
*self
,
711 const char *errors
= NULL
;
713 if (!PyArg_ParseTuple(args
, "O|z:raw_unicode_escape_encode",
717 str
= PyUnicode_FromObject(str
);
720 v
= codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
721 PyUnicode_AS_UNICODE(str
),
722 PyUnicode_GET_SIZE(str
)),
723 PyUnicode_GET_SIZE(str
));
729 latin_1_encode(PyObject
*self
,
733 const char *errors
= NULL
;
735 if (!PyArg_ParseTuple(args
, "O|z:latin_1_encode",
739 str
= PyUnicode_FromObject(str
);
742 v
= codec_tuple(PyUnicode_EncodeLatin1(
743 PyUnicode_AS_UNICODE(str
),
744 PyUnicode_GET_SIZE(str
),
746 PyUnicode_GET_SIZE(str
));
752 ascii_encode(PyObject
*self
,
756 const char *errors
= NULL
;
758 if (!PyArg_ParseTuple(args
, "O|z:ascii_encode",
762 str
= PyUnicode_FromObject(str
);
765 v
= codec_tuple(PyUnicode_EncodeASCII(
766 PyUnicode_AS_UNICODE(str
),
767 PyUnicode_GET_SIZE(str
),
769 PyUnicode_GET_SIZE(str
));
775 charmap_encode(PyObject
*self
,
779 const char *errors
= NULL
;
780 PyObject
*mapping
= NULL
;
782 if (!PyArg_ParseTuple(args
, "O|zO:charmap_encode",
783 &str
, &errors
, &mapping
))
785 if (mapping
== Py_None
)
788 str
= PyUnicode_FromObject(str
);
791 v
= codec_tuple(PyUnicode_EncodeCharmap(
792 PyUnicode_AS_UNICODE(str
),
793 PyUnicode_GET_SIZE(str
),
796 PyUnicode_GET_SIZE(str
));
802 charmap_build(PyObject
*self
, PyObject
*args
)
805 if (!PyArg_ParseTuple(args
, "U:charmap_build", &map
))
807 return PyUnicode_BuildEncodingMap(map
);
810 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
813 mbcs_encode(PyObject
*self
,
817 const char *errors
= NULL
;
819 if (!PyArg_ParseTuple(args
, "O|z:mbcs_encode",
823 str
= PyUnicode_FromObject(str
);
826 v
= codec_tuple(PyUnicode_EncodeMBCS(
827 PyUnicode_AS_UNICODE(str
),
828 PyUnicode_GET_SIZE(str
),
830 PyUnicode_GET_SIZE(str
));
835 #endif /* MS_WINDOWS */
836 #endif /* Py_USING_UNICODE */
838 /* --- Error handler registry --------------------------------------------- */
840 PyDoc_STRVAR(register_error__doc__
,
841 "register_error(errors, handler)\n\
843 Register the specified error handler under the name\n\
844 errors. handler must be a callable object, that\n\
845 will be called with an exception instance containing\n\
846 information about the location of the encoding/decoding\n\
847 error and must return a (replacement, new position) tuple.");
849 static PyObject
*register_error(PyObject
*self
, PyObject
*args
)
854 if (!PyArg_ParseTuple(args
, "sO:register_error",
857 if (PyCodec_RegisterError(name
, handler
))
862 PyDoc_STRVAR(lookup_error__doc__
,
863 "lookup_error(errors) -> handler\n\
865 Return the error handler for the specified error handling name\n\
866 or raise a LookupError, if no handler exists under this name.");
868 static PyObject
*lookup_error(PyObject
*self
, PyObject
*args
)
872 if (!PyArg_ParseTuple(args
, "s:lookup_error",
875 return PyCodec_LookupError(name
);
878 /* --- Module API --------------------------------------------------------- */
880 static PyMethodDef _codecs_functions
[] = {
881 {"register", codec_register
, METH_O
,
883 {"lookup", codec_lookup
, METH_VARARGS
,
885 {"encode", codec_encode
, METH_VARARGS
,
887 {"decode", codec_decode
, METH_VARARGS
,
889 {"escape_encode", escape_encode
, METH_VARARGS
},
890 {"escape_decode", escape_decode
, METH_VARARGS
},
891 #ifdef Py_USING_UNICODE
892 {"utf_8_encode", utf_8_encode
, METH_VARARGS
},
893 {"utf_8_decode", utf_8_decode
, METH_VARARGS
},
894 {"utf_7_encode", utf_7_encode
, METH_VARARGS
},
895 {"utf_7_decode", utf_7_decode
, METH_VARARGS
},
896 {"utf_16_encode", utf_16_encode
, METH_VARARGS
},
897 {"utf_16_le_encode", utf_16_le_encode
, METH_VARARGS
},
898 {"utf_16_be_encode", utf_16_be_encode
, METH_VARARGS
},
899 {"utf_16_decode", utf_16_decode
, METH_VARARGS
},
900 {"utf_16_le_decode", utf_16_le_decode
, METH_VARARGS
},
901 {"utf_16_be_decode", utf_16_be_decode
, METH_VARARGS
},
902 {"utf_16_ex_decode", utf_16_ex_decode
, METH_VARARGS
},
903 {"unicode_escape_encode", unicode_escape_encode
, METH_VARARGS
},
904 {"unicode_escape_decode", unicode_escape_decode
, METH_VARARGS
},
905 {"unicode_internal_encode", unicode_internal_encode
, METH_VARARGS
},
906 {"unicode_internal_decode", unicode_internal_decode
, METH_VARARGS
},
907 {"raw_unicode_escape_encode", raw_unicode_escape_encode
, METH_VARARGS
},
908 {"raw_unicode_escape_decode", raw_unicode_escape_decode
, METH_VARARGS
},
909 {"latin_1_encode", latin_1_encode
, METH_VARARGS
},
910 {"latin_1_decode", latin_1_decode
, METH_VARARGS
},
911 {"ascii_encode", ascii_encode
, METH_VARARGS
},
912 {"ascii_decode", ascii_decode
, METH_VARARGS
},
913 {"charmap_encode", charmap_encode
, METH_VARARGS
},
914 {"charmap_decode", charmap_decode
, METH_VARARGS
},
915 {"charmap_build", charmap_build
, METH_VARARGS
},
916 {"readbuffer_encode", readbuffer_encode
, METH_VARARGS
},
917 {"charbuffer_encode", charbuffer_encode
, METH_VARARGS
},
918 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
919 {"mbcs_encode", mbcs_encode
, METH_VARARGS
},
920 {"mbcs_decode", mbcs_decode
, METH_VARARGS
},
922 #endif /* Py_USING_UNICODE */
923 {"register_error", register_error
, METH_VARARGS
,
924 register_error__doc__
},
925 {"lookup_error", lookup_error
, METH_VARARGS
,
926 lookup_error__doc__
},
927 {NULL
, NULL
} /* sentinel */
933 Py_InitModule("_codecs", _codecs_functions
);