Modules/_codecsmodule.c

   1 /* ------------------------------------------------------------------------
   2
   3    _codecs -- Provides access to the codec registry and the builtin
   4               codecs.
   5
   6    This module should never be imported directly. The standard library
   7    module "codecs" wraps this builtin module for use within Python.
   8
   9    The codec registry is accessible via:
  10
  11      register(search_function) -> None
  12
  13      lookup(encoding) -> CodecInfo object
  14
  15    The builtin Unicode codecs use the following interface:
  16
  17      <encoding>_encode(Unicode_object[,errors='strict']) ->
  18         (string object, bytes consumed)
  19
  20      <encoding>_decode(char_buffer_obj[,errors='strict']) ->
  21         (Unicode object, bytes consumed)
  22
   <encoding>_encode() interfaces also accept non-Unicode objects as
   input. The objects are then converted to Unicode using
   PyUnicode_FromObject() prior to applying the conversion.
  26
  27    These <encoding>s are available: utf_8, unicode_escape,
  28    raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
  29    mbcs (on win32).
  30
  31
  32 Written by Marc-Andre Lemburg (mal@lemburg.com).
  33
  34 Copyright (c) Corporation for National Research Initiatives.
  35
  36    ------------------------------------------------------------------------ */
  37
  38 #define PY_SSIZE_T_CLEAN
  39 #include "Python.h"
  40
  41 /* --- Registry ----------------------------------------------------------- */
  42
  43 PyDoc_STRVAR(register__doc__,
  44 "register(search_function)\n\
  45 \n\
  46 Register a codec search function. Search functions are expected to take\n\
  47 one argument, the encoding name in all lower case letters, and return\n\
  48 a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n\
  49 (or a CodecInfo object).");
  50
  51 static
  52 PyObject *codec_register(PyObject *self, PyObject *search_function)
  53 {
  54     if (PyCodec_Register(search_function))
  55         return NULL;
  56
  57     Py_RETURN_NONE;
  58 }
  59
  60 PyDoc_STRVAR(lookup__doc__,
  61 "lookup(encoding) -> CodecInfo\n\
  62 \n\
  63 Looks up a codec tuple in the Python codec registry and returns\n\
  64 a tuple of function (or a CodecInfo object).");
  65
  66 static
  67 PyObject *codec_lookup(PyObject *self, PyObject *args)
  68 {
  69     char *encoding;
  70
  71     if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
  72         return NULL;
  73
  74     return _PyCodec_Lookup(encoding);
  75 }
  76
  77 PyDoc_STRVAR(encode__doc__,
  78 "encode(obj, [encoding[,errors]]) -> object\n\
  79 \n\
  80 Encodes obj using the codec registered for encoding. encoding defaults\n\
  81 to the default encoding. errors may be given to set a different error\n\
  82 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
  83 a ValueError. Other possible values are 'ignore', 'replace' and\n\
  84 'xmlcharrefreplace' as well as any other name registered with\n\
  85 codecs.register_error that can handle ValueErrors.");
  86
  87 static PyObject *
  88 codec_encode(PyObject *self, PyObject *args)
  89 {
  90     const char *encoding = NULL;
  91     const char *errors = NULL;
  92     PyObject *v;
  93
  94     if (!PyArg_ParseTuple(args, "O|ss:encode", &v, &encoding, &errors))
  95         return NULL;
  96
  97 #ifdef Py_USING_UNICODE
  98     if (encoding == NULL)
  99         encoding = PyUnicode_GetDefaultEncoding();
 100 #else
 101     if (encoding == NULL) {
 102         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 103         return NULL;
 104     }
 105 #endif
 106
 107     /* Encode via the codec registry */
 108     return PyCodec_Encode(v, encoding, errors);
 109 }
 110
 111 PyDoc_STRVAR(decode__doc__,
 112 "decode(obj, [encoding[,errors]]) -> object\n\
 113 \n\
 114 Decodes obj using the codec registered for encoding. encoding defaults\n\
 115 to the default encoding. errors may be given to set a different error\n\
 116 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
 117 a ValueError. Other possible values are 'ignore' and 'replace'\n\
 118 as well as any other name registered with codecs.register_error that is\n\
 119 able to handle ValueErrors.");
 120
 121 static PyObject *
 122 codec_decode(PyObject *self, PyObject *args)
 123 {
 124     const char *encoding = NULL;
 125     const char *errors = NULL;
 126     PyObject *v;
 127
 128     if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
 129         return NULL;
 130
 131 #ifdef Py_USING_UNICODE
 132     if (encoding == NULL)
 133         encoding = PyUnicode_GetDefaultEncoding();
 134 #else
 135     if (encoding == NULL) {
 136         PyErr_SetString(PyExc_ValueError, "no encoding specified");
 137         return NULL;
 138     }
 139 #endif
 140
 141     /* Decode via the codec registry */
 142     return PyCodec_Decode(v, encoding, errors);
 143 }
 144
 145 /* --- Helpers ------------------------------------------------------------ */
 146
 147 static
 148 PyObject *codec_tuple(PyObject *unicode,
 149                       Py_ssize_t len)
 150 {
 151     PyObject *v;
 152     if (unicode == NULL)
 153         return NULL;
 154     v = Py_BuildValue("On", unicode, len);
 155     Py_DECREF(unicode);
 156     return v;
 157 }
 158
 159 /* --- String codecs ------------------------------------------------------ */
 160 static PyObject *
 161 escape_decode(PyObject *self,
 162               PyObject *args)
 163 {
 164     const char *errors = NULL;
 165     const char *data;
 166     Py_ssize_t size;
 167
 168     if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
 169                           &data, &size, &errors))
 170         return NULL;
 171     return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
 172                        size);
 173 }
 174
 175 static PyObject *
 176 escape_encode(PyObject *self,
 177               PyObject *args)
 178 {
 179         PyObject *str;
 180         const char *errors = NULL;
 181         char *buf;
 182         Py_ssize_t len;
 183
 184         if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
 185                               &PyString_Type, &str, &errors))
 186                 return NULL;
 187
 188         str = PyString_Repr(str, 0);
 189         if (!str)
 190                 return NULL;
 191
 192         /* The string will be quoted. Unquote, similar to unicode-escape. */
 193         buf = PyString_AS_STRING (str);
 194         len = PyString_GET_SIZE (str);
 195         memmove(buf, buf+1, len-2);
 196         if (_PyString_Resize(&str, len-2) < 0)
 197                 return NULL;
 198
 199         return codec_tuple(str, PyString_Size(str));
 200 }
 201
 202 #ifdef Py_USING_UNICODE
 203 /* --- Decoder ------------------------------------------------------------ */
 204
 205 static PyObject *
 206 unicode_internal_decode(PyObject *self,
 207                         PyObject *args)
 208 {
 209     PyObject *obj;
 210     const char *errors = NULL;
 211     const char *data;
 212     Py_ssize_t size;
 213
 214     if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
 215                           &obj, &errors))
 216         return NULL;
 217
 218     if (PyUnicode_Check(obj)) {
 219         Py_INCREF(obj);
 220         return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
 221     }
 222     else {
 223         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
 224             return NULL;
 225
 226         return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
 227                            size);
 228     }
 229 }
 230
 231 static PyObject *
 232 utf_7_decode(PyObject *self,
 233              PyObject *args)
 234 {
 235         Py_buffer pbuf;
 236     const char *errors = NULL;
 237     int final = 0;
 238     Py_ssize_t consumed;
 239     PyObject *decoded = NULL;
 240
 241     if (!PyArg_ParseTuple(args, "s*|zi:utf_7_decode",
 242                           &pbuf, &errors, &final))
 243         return NULL;
 244     consumed = pbuf.len;
 245
 246     decoded = PyUnicode_DecodeUTF7Stateful(pbuf.buf, pbuf.len, errors,
 247                                            final ? NULL : &consumed);
 248         PyBuffer_Release(&pbuf);
 249     if (decoded == NULL)
 250         return NULL;
 251     return codec_tuple(decoded, consumed);
 252 }
 253
 254 static PyObject *
 255 utf_8_decode(PyObject *self,
 256             PyObject *args)
 257 {
 258         Py_buffer pbuf;
 259     const char *errors = NULL;
 260     int final = 0;
 261     Py_ssize_t consumed;
 262     PyObject *decoded = NULL;
 263
 264     if (!PyArg_ParseTuple(args, "s*|zi:utf_8_decode",
 265                           &pbuf, &errors, &final))
 266         return NULL;
 267     consumed = pbuf.len;
 268
 269     decoded = PyUnicode_DecodeUTF8Stateful(pbuf.buf, pbuf.len, errors,
 270                                            final ? NULL : &consumed);
 271         PyBuffer_Release(&pbuf);
 272     if (decoded == NULL)
 273         return NULL;
 274     return codec_tuple(decoded, consumed);
 275 }
 276
 277 static PyObject *
 278 utf_16_decode(PyObject *self,
 279             PyObject *args)
 280 {
 281         Py_buffer pbuf;
 282     const char *errors = NULL;
 283     int byteorder = 0;
 284     int final = 0;
 285     Py_ssize_t consumed;
 286     PyObject *decoded;
 287
 288     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_decode",
 289                           &pbuf, &errors, &final))
 290         return NULL;
 291     consumed = pbuf.len; /* This is overwritten unless final is true. */
 292     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
 293                                         &byteorder, final ? NULL : &consumed);
 294         PyBuffer_Release(&pbuf);
 295     if (decoded == NULL)
 296         return NULL;
 297     return codec_tuple(decoded, consumed);
 298 }
 299
 300 static PyObject *
 301 utf_16_le_decode(PyObject *self,
 302                  PyObject *args)
 303 {
 304         Py_buffer pbuf;
 305     const char *errors = NULL;
 306     int byteorder = -1;
 307     int final = 0;
 308     Py_ssize_t consumed;
 309     PyObject *decoded = NULL;
 310
 311     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_le_decode",
 312                           &pbuf, &errors, &final))
 313         return NULL;
 314
 315     consumed = pbuf.len; /* This is overwritten unless final is true. */
 316     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
 317         &byteorder, final ? NULL : &consumed);
 318         PyBuffer_Release(&pbuf);
 319     if (decoded == NULL)
 320         return NULL;
 321     return codec_tuple(decoded, consumed);
 322 }
 323
 324 static PyObject *
 325 utf_16_be_decode(PyObject *self,
 326                  PyObject *args)
 327 {
 328         Py_buffer pbuf;
 329     const char *errors = NULL;
 330     int byteorder = 1;
 331     int final = 0;
 332     Py_ssize_t consumed;
 333     PyObject *decoded = NULL;
 334
 335     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_be_decode",
 336                           &pbuf, &errors, &final))
 337         return NULL;
 338
 339     consumed = pbuf.len; /* This is overwritten unless final is true. */
 340     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
 341         &byteorder, final ? NULL : &consumed);
 342         PyBuffer_Release(&pbuf);
 343     if (decoded == NULL)
 344         return NULL;
 345     return codec_tuple(decoded, consumed);
 346 }
 347
 348 /* This non-standard version also provides access to the byteorder
 349    parameter of the builtin UTF-16 codec.
 350
 351    It returns a tuple (unicode, bytesread, byteorder) with byteorder
 352    being the value in effect at the end of data.
 353
 354 */
 355
 356 static PyObject *
 357 utf_16_ex_decode(PyObject *self,
 358                  PyObject *args)
 359 {
 360         Py_buffer pbuf;
 361     const char *errors = NULL;
 362     int byteorder = 0;
 363     PyObject *unicode, *tuple;
 364     int final = 0;
 365     Py_ssize_t consumed;
 366
 367     if (!PyArg_ParseTuple(args, "s*|zii:utf_16_ex_decode",
 368                           &pbuf, &errors, &byteorder, &final))
 369         return NULL;
 370     consumed = pbuf.len; /* This is overwritten unless final is true. */
 371     unicode = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
 372                                         &byteorder, final ? NULL : &consumed);
 373         PyBuffer_Release(&pbuf);
 374     if (unicode == NULL)
 375         return NULL;
 376     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
 377     Py_DECREF(unicode);
 378     return tuple;
 379 }
 380
 381 static PyObject *
 382 utf_32_decode(PyObject *self,
 383             PyObject *args)
 384 {
 385         Py_buffer pbuf;
 386     const char *errors = NULL;
 387     int byteorder = 0;
 388     int final = 0;
 389     Py_ssize_t consumed;
 390     PyObject *decoded;
 391
 392     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_decode",
 393                           &pbuf, &errors, &final))
 394         return NULL;
 395     consumed = pbuf.len; /* This is overwritten unless final is true. */
 396     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
 397                                         &byteorder, final ? NULL : &consumed);
 398         PyBuffer_Release(&pbuf);
 399     if (decoded == NULL)
 400         return NULL;
 401     return codec_tuple(decoded, consumed);
 402 }
 403
 404 static PyObject *
 405 utf_32_le_decode(PyObject *self,
 406                  PyObject *args)
 407 {
 408         Py_buffer pbuf;
 409     const char *errors = NULL;
 410     int byteorder = -1;
 411     int final = 0;
 412     Py_ssize_t consumed;
 413     PyObject *decoded;
 414
 415     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_le_decode",
 416                           &pbuf, &errors, &final))
 417         return NULL;
 418     consumed = pbuf.len; /* This is overwritten unless final is true. */
 419     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
 420                                         &byteorder, final ? NULL : &consumed);
 421         PyBuffer_Release(&pbuf);
 422     if (decoded == NULL)
 423         return NULL;
 424     return codec_tuple(decoded, consumed);
 425 }
 426
 427 static PyObject *
 428 utf_32_be_decode(PyObject *self,
 429                  PyObject *args)
 430 {
 431         Py_buffer pbuf;
 432     const char *errors = NULL;
 433     int byteorder = 1;
 434     int final = 0;
 435     Py_ssize_t consumed;
 436     PyObject *decoded;
 437
 438     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_be_decode",
 439                           &pbuf, &errors, &final))
 440         return NULL;
 441     consumed = pbuf.len; /* This is overwritten unless final is true. */
 442     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
 443                                         &byteorder, final ? NULL : &consumed);
 444         PyBuffer_Release(&pbuf);
 445     if (decoded == NULL)
 446         return NULL;
 447     return codec_tuple(decoded, consumed);
 448 }
 449
 450 /* This non-standard version also provides access to the byteorder
 451    parameter of the builtin UTF-32 codec.
 452
 453    It returns a tuple (unicode, bytesread, byteorder) with byteorder
 454    being the value in effect at the end of data.
 455
 456 */
 457
 458 static PyObject *
 459 utf_32_ex_decode(PyObject *self,
 460                  PyObject *args)
 461 {
 462         Py_buffer pbuf;
 463     const char *errors = NULL;
 464     int byteorder = 0;
 465     PyObject *unicode, *tuple;
 466     int final = 0;
 467     Py_ssize_t consumed;
 468
 469     if (!PyArg_ParseTuple(args, "s*|zii:utf_32_ex_decode",
 470                           &pbuf, &errors, &byteorder, &final))
 471         return NULL;
 472     consumed = pbuf.len; /* This is overwritten unless final is true. */
 473     unicode = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
 474                                         &byteorder, final ? NULL : &consumed);
 475         PyBuffer_Release(&pbuf);
 476     if (unicode == NULL)
 477         return NULL;
 478     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
 479     Py_DECREF(unicode);
 480     return tuple;
 481 }
 482
 483 static PyObject *
 484 unicode_escape_decode(PyObject *self,
 485                      PyObject *args)
 486 {
 487         Py_buffer pbuf;
 488     const char *errors = NULL;
 489         PyObject *unicode;
 490
 491     if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
 492                           &pbuf, &errors))
 493         return NULL;
 494
 495         unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, pbuf.len, errors);
 496         PyBuffer_Release(&pbuf);
 497         return codec_tuple(unicode, pbuf.len);
 498 }
 499
 500 static PyObject *
 501 raw_unicode_escape_decode(PyObject *self,
 502                         PyObject *args)
 503 {
 504         Py_buffer pbuf;
 505     const char *errors = NULL;
 506         PyObject *unicode;
 507
 508     if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
 509                           &pbuf, &errors))
 510         return NULL;
 511
 512         unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, pbuf.len, errors);
 513         PyBuffer_Release(&pbuf);
 514         return codec_tuple(unicode, pbuf.len);
 515 }
 516
 517 static PyObject *
 518 latin_1_decode(PyObject *self,
 519                PyObject *args)
 520 {
 521         Py_buffer pbuf;
 522         PyObject *unicode;
 523     const char *errors = NULL;
 524
 525     if (!PyArg_ParseTuple(args, "s*|z:latin_1_decode",
 526                           &pbuf, &errors))
 527         return NULL;
 528
 529         unicode = PyUnicode_DecodeLatin1(pbuf.buf, pbuf.len, errors);
 530         PyBuffer_Release(&pbuf);
 531         return codec_tuple(unicode, pbuf.len);
 532 }
 533
 534 static PyObject *
 535 ascii_decode(PyObject *self,
 536              PyObject *args)
 537 {
 538         Py_buffer pbuf;
 539         PyObject *unicode;
 540     const char *errors = NULL;
 541
 542     if (!PyArg_ParseTuple(args, "s*|z:ascii_decode",
 543                           &pbuf, &errors))
 544         return NULL;
 545
 546         unicode = PyUnicode_DecodeASCII(pbuf.buf, pbuf.len, errors);
 547         PyBuffer_Release(&pbuf);
 548         return codec_tuple(unicode, pbuf.len);
 549 }
 550
 551 static PyObject *
 552 charmap_decode(PyObject *self,
 553                PyObject *args)
 554 {
 555         Py_buffer pbuf;
 556         PyObject *unicode;
 557     const char *errors = NULL;
 558     PyObject *mapping = NULL;
 559
 560     if (!PyArg_ParseTuple(args, "s*|zO:charmap_decode",
 561                           &pbuf, &errors, &mapping))
 562         return NULL;
 563     if (mapping == Py_None)
 564         mapping = NULL;
 565
 566         unicode = PyUnicode_DecodeCharmap(pbuf.buf, pbuf.len, mapping, errors);
 567         PyBuffer_Release(&pbuf);
 568         return codec_tuple(unicode, pbuf.len);
 569 }
 570
 571 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 572
 573 static PyObject *
 574 mbcs_decode(PyObject *self,
 575             PyObject *args)
 576 {
 577         Py_buffer pbuf;
 578     const char *errors = NULL;
 579     int final = 0;
 580     Py_ssize_t consumed;
 581     PyObject *decoded = NULL;
 582
 583     if (!PyArg_ParseTuple(args, "s*|zi:mbcs_decode",
 584                           &pbuf, &errors, &final))
 585         return NULL;
 586     consumed = pbuf.len;
 587
 588     decoded = PyUnicode_DecodeMBCSStateful(pbuf.buf, pbuf.len, errors,
 589                                            final ? NULL : &consumed);
 590         PyBuffer_Release(&pbuf);
 591     if (decoded == NULL)
 592         return NULL;
 593     return codec_tuple(decoded, consumed);
 594 }
 595
 596 #endif /* MS_WINDOWS */
 597
 598 /* --- Encoder ------------------------------------------------------------ */
 599
 600 static PyObject *
 601 readbuffer_encode(PyObject *self,
 602                   PyObject *args)
 603 {
 604     const char *data;
 605     Py_ssize_t size;
 606     const char *errors = NULL;
 607
 608     if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
 609                           &data, &size, &errors))
 610         return NULL;
 611
 612     return codec_tuple(PyString_FromStringAndSize(data, size),
 613                        size);
 614 }
 615
 616 static PyObject *
 617 charbuffer_encode(PyObject *self,
 618                   PyObject *args)
 619 {
 620     const char *data;
 621     Py_ssize_t size;
 622     const char *errors = NULL;
 623
 624     if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
 625                           &data, &size, &errors))
 626         return NULL;
 627
 628     return codec_tuple(PyString_FromStringAndSize(data, size),
 629                        size);
 630 }
 631
 632 static PyObject *
 633 unicode_internal_encode(PyObject *self,
 634                         PyObject *args)
 635 {
 636     PyObject *obj;
 637     const char *errors = NULL;
 638     const char *data;
 639     Py_ssize_t size;
 640
 641     if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
 642                           &obj, &errors))
 643         return NULL;
 644
 645     if (PyUnicode_Check(obj)) {
 646         data = PyUnicode_AS_DATA(obj);
 647         size = PyUnicode_GET_DATA_SIZE(obj);
 648         return codec_tuple(PyString_FromStringAndSize(data, size),
 649                            size);
 650     }
 651     else {
 652         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
 653             return NULL;
 654         return codec_tuple(PyString_FromStringAndSize(data, size),
 655                            size);
 656     }
 657 }
 658
 659 static PyObject *
 660 utf_7_encode(PyObject *self,
 661             PyObject *args)
 662 {
 663     PyObject *str, *v;
 664     const char *errors = NULL;
 665
 666     if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
 667                           &str, &errors))
 668         return NULL;
 669
 670     str = PyUnicode_FromObject(str);
 671     if (str == NULL)
 672         return NULL;
 673     v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
 674                                          PyUnicode_GET_SIZE(str),
 675                                          0,
 676                                          0,
 677                                          errors),
 678                     PyUnicode_GET_SIZE(str));
 679     Py_DECREF(str);
 680     return v;
 681 }
 682
 683 static PyObject *
 684 utf_8_encode(PyObject *self,
 685             PyObject *args)
 686 {
 687     PyObject *str, *v;
 688     const char *errors = NULL;
 689
 690     if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
 691                           &str, &errors))
 692         return NULL;
 693
 694     str = PyUnicode_FromObject(str);
 695     if (str == NULL)
 696         return NULL;
 697     v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
 698                                          PyUnicode_GET_SIZE(str),
 699                                          errors),
 700                     PyUnicode_GET_SIZE(str));
 701     Py_DECREF(str);
 702     return v;
 703 }
 704
 705 /* This version provides access to the byteorder parameter of the
 706    builtin UTF-16 codecs as optional third argument. It defaults to 0
 707    which means: use the native byte order and prepend the data with a
 708    BOM mark.
 709
 710 */
 711
 712 static PyObject *
 713 utf_16_encode(PyObject *self,
 714             PyObject *args)
 715 {
 716     PyObject *str, *v;
 717     const char *errors = NULL;
 718     int byteorder = 0;
 719
 720     if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
 721                           &str, &errors, &byteorder))
 722         return NULL;
 723
 724     str = PyUnicode_FromObject(str);
 725     if (str == NULL)
 726         return NULL;
 727     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
 728                                           PyUnicode_GET_SIZE(str),
 729                                           errors,
 730                                           byteorder),
 731                     PyUnicode_GET_SIZE(str));
 732     Py_DECREF(str);
 733     return v;
 734 }
 735
 736 static PyObject *
 737 utf_16_le_encode(PyObject *self,
 738                  PyObject *args)
 739 {
 740     PyObject *str, *v;
 741     const char *errors = NULL;
 742
 743     if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
 744                           &str, &errors))
 745         return NULL;
 746
 747     str = PyUnicode_FromObject(str);
 748     if (str == NULL)
 749         return NULL;
 750     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
 751                                              PyUnicode_GET_SIZE(str),
 752                                              errors,
 753                                              -1),
 754                        PyUnicode_GET_SIZE(str));
 755     Py_DECREF(str);
 756     return v;
 757 }
 758
 759 static PyObject *
 760 utf_16_be_encode(PyObject *self,
 761                  PyObject *args)
 762 {
 763     PyObject *str, *v;
 764     const char *errors = NULL;
 765
 766     if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
 767                           &str, &errors))
 768         return NULL;
 769
 770     str = PyUnicode_FromObject(str);
 771     if (str == NULL)
 772         return NULL;
 773     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
 774                                           PyUnicode_GET_SIZE(str),
 775                                           errors,
 776                                           +1),
 777                     PyUnicode_GET_SIZE(str));
 778     Py_DECREF(str);
 779     return v;
 780 }
 781
 782 /* This version provides access to the byteorder parameter of the
 783    builtin UTF-32 codecs as optional third argument. It defaults to 0
 784    which means: use the native byte order and prepend the data with a
 785    BOM mark.
 786
 787 */
 788
 789 static PyObject *
 790 utf_32_encode(PyObject *self,
 791             PyObject *args)
 792 {
 793     PyObject *str, *v;
 794     const char *errors = NULL;
 795     int byteorder = 0;
 796
 797     if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
 798                           &str, &errors, &byteorder))
 799         return NULL;
 800
 801     str = PyUnicode_FromObject(str);
 802     if (str == NULL)
 803         return NULL;
 804     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
 805                                           PyUnicode_GET_SIZE(str),
 806                                           errors,
 807                                           byteorder),
 808                     PyUnicode_GET_SIZE(str));
 809     Py_DECREF(str);
 810     return v;
 811 }
 812
 813 static PyObject *
 814 utf_32_le_encode(PyObject *self,
 815                  PyObject *args)
 816 {
 817     PyObject *str, *v;
 818     const char *errors = NULL;
 819
 820     if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
 821                           &str, &errors))
 822         return NULL;
 823
 824     str = PyUnicode_FromObject(str);
 825     if (str == NULL)
 826         return NULL;
 827     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
 828                                              PyUnicode_GET_SIZE(str),
 829                                              errors,
 830                                              -1),
 831                        PyUnicode_GET_SIZE(str));
 832     Py_DECREF(str);
 833     return v;
 834 }
 835
 836 static PyObject *
 837 utf_32_be_encode(PyObject *self,
 838                  PyObject *args)
 839 {
 840     PyObject *str, *v;
 841     const char *errors = NULL;
 842
 843     if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
 844                           &str, &errors))
 845         return NULL;
 846
 847     str = PyUnicode_FromObject(str);
 848     if (str == NULL)
 849         return NULL;
 850     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
 851                                           PyUnicode_GET_SIZE(str),
 852                                           errors,
 853                                           +1),
 854                     PyUnicode_GET_SIZE(str));
 855     Py_DECREF(str);
 856     return v;
 857 }
 858
 859 static PyObject *
 860 unicode_escape_encode(PyObject *self,
 861                      PyObject *args)
 862 {
 863     PyObject *str, *v;
 864     const char *errors = NULL;
 865
 866     if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
 867                           &str, &errors))
 868         return NULL;
 869
 870     str = PyUnicode_FromObject(str);
 871     if (str == NULL)
 872         return NULL;
 873     v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
 874                                                   PyUnicode_GET_SIZE(str)),
 875                     PyUnicode_GET_SIZE(str));
 876     Py_DECREF(str);
 877     return v;
 878 }
 879
 880 static PyObject *
 881 raw_unicode_escape_encode(PyObject *self,
 882                         PyObject *args)
 883 {
 884     PyObject *str, *v;
 885     const char *errors = NULL;
 886
 887     if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
 888                           &str, &errors))
 889         return NULL;
 890
 891     str = PyUnicode_FromObject(str);
 892     if (str == NULL)
 893         return NULL;
 894     v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
 895                                PyUnicode_AS_UNICODE(str),
 896                                PyUnicode_GET_SIZE(str)),
 897                     PyUnicode_GET_SIZE(str));
 898     Py_DECREF(str);
 899     return v;
 900 }
 901
 902 static PyObject *
 903 latin_1_encode(PyObject *self,
 904                PyObject *args)
 905 {
 906     PyObject *str, *v;
 907     const char *errors = NULL;
 908
 909     if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
 910                           &str, &errors))
 911         return NULL;
 912
 913     str = PyUnicode_FromObject(str);
 914     if (str == NULL)
 915         return NULL;
 916     v = codec_tuple(PyUnicode_EncodeLatin1(
 917                                PyUnicode_AS_UNICODE(str),
 918                                PyUnicode_GET_SIZE(str),
 919                                errors),
 920                     PyUnicode_GET_SIZE(str));
 921     Py_DECREF(str);
 922     return v;
 923 }
 924
 925 static PyObject *
 926 ascii_encode(PyObject *self,
 927              PyObject *args)
 928 {
 929     PyObject *str, *v;
 930     const char *errors = NULL;
 931
 932     if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
 933                           &str, &errors))
 934         return NULL;
 935
 936     str = PyUnicode_FromObject(str);
 937     if (str == NULL)
 938         return NULL;
 939     v = codec_tuple(PyUnicode_EncodeASCII(
 940                                PyUnicode_AS_UNICODE(str),
 941                                PyUnicode_GET_SIZE(str),
 942                                errors),
 943                     PyUnicode_GET_SIZE(str));
 944     Py_DECREF(str);
 945     return v;
 946 }
 947
 948 static PyObject *
 949 charmap_encode(PyObject *self,
 950              PyObject *args)
 951 {
 952     PyObject *str, *v;
 953     const char *errors = NULL;
 954     PyObject *mapping = NULL;
 955
 956     if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
 957                           &str, &errors, &mapping))
 958         return NULL;
 959     if (mapping == Py_None)
 960         mapping = NULL;
 961
 962     str = PyUnicode_FromObject(str);
 963     if (str == NULL)
 964         return NULL;
 965     v = codec_tuple(PyUnicode_EncodeCharmap(
 966                                PyUnicode_AS_UNICODE(str),
 967                                PyUnicode_GET_SIZE(str),
 968                                mapping,
 969                                errors),
 970                     PyUnicode_GET_SIZE(str));
 971     Py_DECREF(str);
 972     return v;
 973 }
 974
 975 static PyObject*
 976 charmap_build(PyObject *self, PyObject *args)
 977 {
 978     PyObject *map;
 979     if (!PyArg_ParseTuple(args, "U:charmap_build", &map))
 980         return NULL;
 981     return PyUnicode_BuildEncodingMap(map);
 982 }
 983
 984 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 985
 986 static PyObject *
 987 mbcs_encode(PyObject *self,
 988             PyObject *args)
 989 {
 990     PyObject *str, *v;
 991     const char *errors = NULL;
 992
 993     if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
 994                           &str, &errors))
 995         return NULL;
 996
 997     str = PyUnicode_FromObject(str);
 998     if (str == NULL)
 999         return NULL;
1000     v = codec_tuple(PyUnicode_EncodeMBCS(
1001                                PyUnicode_AS_UNICODE(str),
1002                                PyUnicode_GET_SIZE(str),
1003                                errors),
1004                     PyUnicode_GET_SIZE(str));
1005     Py_DECREF(str);
1006     return v;
1007 }
1008
1009 #endif /* MS_WINDOWS */
1010 #endif /* Py_USING_UNICODE */
1011
1012 /* --- Error handler registry --------------------------------------------- */
1013
1014 PyDoc_STRVAR(register_error__doc__,
1015 "register_error(errors, handler)\n\
1016 \n\
1017 Register the specified error handler under the name\n\
1018 errors. handler must be a callable object, that\n\
1019 will be called with an exception instance containing\n\
1020 information about the location of the encoding/decoding\n\
1021 error and must return a (replacement, new position) tuple.");
1022
1023 static PyObject *register_error(PyObject *self, PyObject *args)
1024 {
1025     const char *name;
1026     PyObject *handler;
1027
1028     if (!PyArg_ParseTuple(args, "sO:register_error",
1029                           &name, &handler))
1030         return NULL;
1031     if (PyCodec_RegisterError(name, handler))
1032         return NULL;
1033     Py_RETURN_NONE;
1034 }
1035
1036 PyDoc_STRVAR(lookup_error__doc__,
1037 "lookup_error(errors) -> handler\n\
1038 \n\
1039 Return the error handler for the specified error handling name\n\
1040 or raise a LookupError, if no handler exists under this name.");
1041
1042 static PyObject *lookup_error(PyObject *self, PyObject *args)
1043 {
1044     const char *name;
1045
1046     if (!PyArg_ParseTuple(args, "s:lookup_error",
1047                           &name))
1048         return NULL;
1049     return PyCodec_LookupError(name);
1050 }
1051
1052 /* --- Module API --------------------------------------------------------- */
1053
1054 static PyMethodDef _codecs_functions[] = {
1055     {"register",                codec_register,                 METH_O,
1056         register__doc__},
1057     {"lookup",                  codec_lookup,                   METH_VARARGS,
1058         lookup__doc__},
1059     {"encode",                  codec_encode,                   METH_VARARGS,
1060         encode__doc__},
1061     {"decode",                  codec_decode,                   METH_VARARGS,
1062         decode__doc__},
1063     {"escape_encode",           escape_encode,                  METH_VARARGS},
1064     {"escape_decode",           escape_decode,                  METH_VARARGS},
1065 #ifdef Py_USING_UNICODE
1066     {"utf_8_encode",            utf_8_encode,                   METH_VARARGS},
1067     {"utf_8_decode",            utf_8_decode,                   METH_VARARGS},
1068     {"utf_7_encode",            utf_7_encode,                   METH_VARARGS},
1069     {"utf_7_decode",            utf_7_decode,                   METH_VARARGS},
1070     {"utf_16_encode",           utf_16_encode,                  METH_VARARGS},
1071     {"utf_16_le_encode",        utf_16_le_encode,               METH_VARARGS},
1072     {"utf_16_be_encode",        utf_16_be_encode,               METH_VARARGS},
1073     {"utf_16_decode",           utf_16_decode,                  METH_VARARGS},
1074     {"utf_16_le_decode",        utf_16_le_decode,               METH_VARARGS},
1075     {"utf_16_be_decode",        utf_16_be_decode,               METH_VARARGS},
1076     {"utf_16_ex_decode",        utf_16_ex_decode,               METH_VARARGS},
1077     {"utf_32_encode",           utf_32_encode,                  METH_VARARGS},
1078     {"utf_32_le_encode",        utf_32_le_encode,               METH_VARARGS},
1079     {"utf_32_be_encode",        utf_32_be_encode,               METH_VARARGS},
1080     {"utf_32_decode",           utf_32_decode,                  METH_VARARGS},
1081     {"utf_32_le_decode",        utf_32_le_decode,               METH_VARARGS},
1082     {"utf_32_be_decode",        utf_32_be_decode,               METH_VARARGS},
1083     {"utf_32_ex_decode",        utf_32_ex_decode,               METH_VARARGS},
1084     {"unicode_escape_encode",   unicode_escape_encode,          METH_VARARGS},
1085     {"unicode_escape_decode",   unicode_escape_decode,          METH_VARARGS},
1086     {"unicode_internal_encode", unicode_internal_encode,        METH_VARARGS},
1087     {"unicode_internal_decode", unicode_internal_decode,        METH_VARARGS},
1088     {"raw_unicode_escape_encode", raw_unicode_escape_encode,    METH_VARARGS},
1089     {"raw_unicode_escape_decode", raw_unicode_escape_decode,    METH_VARARGS},
1090     {"latin_1_encode",          latin_1_encode,                 METH_VARARGS},
1091     {"latin_1_decode",          latin_1_decode,                 METH_VARARGS},
1092     {"ascii_encode",            ascii_encode,                   METH_VARARGS},
1093     {"ascii_decode",            ascii_decode,                   METH_VARARGS},
1094     {"charmap_encode",          charmap_encode,                 METH_VARARGS},
1095     {"charmap_decode",          charmap_decode,                 METH_VARARGS},
1096     {"charmap_build",           charmap_build,                  METH_VARARGS},
1097     {"readbuffer_encode",       readbuffer_encode,              METH_VARARGS},
1098     {"charbuffer_encode",       charbuffer_encode,              METH_VARARGS},
1099 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1100     {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
1101     {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
1102 #endif
1103 #endif /* Py_USING_UNICODE */
1104     {"register_error",          register_error,                 METH_VARARGS,
1105         register_error__doc__},
1106     {"lookup_error",            lookup_error,                   METH_VARARGS,
1107         lookup_error__doc__},
1108     {NULL, NULL}                /* sentinel */
1109 };
1110
1111 PyMODINIT_FUNC
1112 init_codecs(void)
1113 {
1114     Py_InitModule("_codecs", _codecs_functions);
1115 }