Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WINDOWS
  45 #include <windows.h>
  46 #endif
  47
/* Limit for the Unicode object free list: unicode_dealloc() only
   recycles an object while unicode_freelist_size is below this value. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   Objects whose length is greater than or equal to the limit have
   their character buffer released when they are put on the free list.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.
   NOTE(review): the limit is compared against the length in characters,
   so the buffer term is presumably KEEPALIVE_SIZE_LIMIT *
   sizeof(Py_UNICODE) bytes rather than KEEPALIVE_SIZE_LIMIT bytes.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
  78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects: a singly linked list of recycled
   objects (the link is stored in the first pointer-sized slot of each
   object) together with the number of objects currently on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are filled in lazily by
   PyUnicode_FromUnicode(); an unused slot is NULL. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The buffer is a fixed 100 bytes, so encoding names must fit in 99
   characters plus the terminating NUL.

*/
static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          /* MvL said unicode->str[] may be signed.  Python generally assumes
 136           * an int contains at least 32 bits, and we don't use more than
 137           * 32 bits even in a UCS4 build, so casting to unsigned int should
 138           * be correct.
 139           */
 140          (unsigned int)unicode->str[0] < 256U &&
 141          unicode_latin1[unicode->str[0]] == unicode)) {
 142         PyErr_SetString(PyExc_SystemError,
 143                         "can't resize shared unicode objects");
 144         return -1;
 145     }
 146
 147     /* We allocate one more byte to make sure the string is
 148        Ux0000 terminated -- XXX is this needed ? */
 149     oldstr = unicode->str;
 150     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 151     if (!unicode->str) {
 152         unicode->str = oldstr;
 153         PyErr_NoMemory();
 154         return -1;
 155     }
 156     unicode->str[length] = 0;
 157     unicode->length = length;
 158
 159  reset:
 160     /* Reset the object caches */
 161     if (unicode->defenc) {
 162         Py_DECREF(unicode->defenc);
 163         unicode->defenc = NULL;
 164     }
 165     unicode->hash = -1;
 166
 167     return 0;
 168 }
 169
 170 /* We allocate one more byte to make sure the string is
 171    Ux0000 terminated -- XXX is this needed ?
 172
 173    XXX This allocator could further be enhanced by assuring that the
 174        free list never reduces its size below 1.
 175
 176 */
 177
 178 static
 179 PyUnicodeObject *_PyUnicode_New(int length)
 180 {
 181     register PyUnicodeObject *unicode;
 182
 183     /* Optimization fo empty strings */
 184     if (length == 0 && unicode_empty != NULL) {
 185         Py_INCREF(unicode_empty);
 186         return unicode_empty;
 187     }
 188
 189     /* Unicode freelist & memory allocation */
 190     if (unicode_freelist) {
 191         unicode = unicode_freelist;
 192         unicode_freelist = *(PyUnicodeObject **)unicode;
 193         unicode_freelist_size--;
 194         if (unicode->str) {
 195             /* Keep-Alive optimization: we only upsize the buffer,
 196                never downsize it. */
 197             if ((unicode->length < length) &&
 198                 unicode_resize(unicode, length) < 0) {
 199                 PyMem_DEL(unicode->str);
 200                 goto onError;
 201             }
 202         }
 203         else {
 204             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 205         }
 206         PyObject_INIT(unicode, &PyUnicode_Type);
 207     }
 208     else {
 209         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 210         if (unicode == NULL)
 211             return NULL;
 212         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 213     }
 214
 215     if (!unicode->str) {
 216         PyErr_NoMemory();
 217         goto onError;
 218     }
 219     /* Initialize the first element to guard against cases where
 220      * the caller fails before initializing str -- unicode_resize()
 221      * reads str[0], and the Keep-Alive optimization can keep memory
 222      * allocated for str alive across a call to unicode_dealloc(unicode).
 223      * We don't want unicode_resize to read uninitialized memory in
 224      * that case.
 225      */
 226     unicode->str[0] = 0;
 227     unicode->str[length] = 0;
 228     unicode->length = length;
 229     unicode->hash = -1;
 230     unicode->defenc = NULL;
 231     return unicode;
 232
 233  onError:
 234     _Py_ForgetReference((PyObject *)unicode);
 235     PyObject_Del(unicode);
 236     return NULL;
 237 }
 238
 239 static
 240 void unicode_dealloc(register PyUnicodeObject *unicode)
 241 {
 242     if (PyUnicode_CheckExact(unicode) &&
 243         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 244         /* Keep-Alive optimization */
 245         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 246             PyMem_DEL(unicode->str);
 247             unicode->str = NULL;
 248             unicode->length = 0;
 249         }
 250         if (unicode->defenc) {
 251             Py_DECREF(unicode->defenc);
 252             unicode->defenc = NULL;
 253         }
 254         /* Add to free list */
 255         *(PyUnicodeObject **)unicode = unicode_freelist;
 256         unicode_freelist = unicode;
 257         unicode_freelist_size++;
 258     }
 259     else {
 260         PyMem_DEL(unicode->str);
 261         Py_XDECREF(unicode->defenc);
 262         unicode->ob_type->tp_free((PyObject *)unicode);
 263     }
 264 }
 265
 266 int PyUnicode_Resize(PyObject **unicode, int length)
 267 {
 268     register PyUnicodeObject *v;
 269
 270     /* Argument checks */
 271     if (unicode == NULL) {
 272         PyErr_BadInternalCall();
 273         return -1;
 274     }
 275     v = (PyUnicodeObject *)*unicode;
 276     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
 277         PyErr_BadInternalCall();
 278         return -1;
 279     }
 280
 281     /* Resizing unicode_empty and single character objects is not
 282        possible since these are being shared. We simply return a fresh
 283        copy with the same Unicode content. */
 284     if (v->length != length &&
 285         (v == unicode_empty || v->length == 1)) {
 286         PyUnicodeObject *w = _PyUnicode_New(length);
 287         if (w == NULL)
 288             return -1;
 289         Py_UNICODE_COPY(w->str, v->str,
 290                         length < v->length ? length : v->length);
 291         Py_DECREF(*unicode);
 292         *unicode = (PyObject *)w;
 293         return 0;
 294     }
 295
 296     /* Note that we don't have to modify *unicode for unshared Unicode
 297        objects, since we can modify them in-place. */
 298     return unicode_resize(v, length);
 299 }
 300
 301 /* Internal API for use in unicodeobject.c only ! */
 302 #define _PyUnicode_Resize(unicodevar, length) \
 303         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 304
 305 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 306                                 int size)
 307 {
 308     PyUnicodeObject *unicode;
 309
 310     /* If the Unicode data is known at construction time, we can apply
 311        some optimizations which share commonly used objects. */
 312     if (u != NULL) {
 313
 314         /* Optimization for empty strings */
 315         if (size == 0 && unicode_empty != NULL) {
 316             Py_INCREF(unicode_empty);
 317             return (PyObject *)unicode_empty;
 318         }
 319
 320         /* Single character Unicode objects in the Latin-1 range are
 321            shared when using this constructor */
 322         if (size == 1 && *u < 256) {
 323             unicode = unicode_latin1[*u];
 324             if (!unicode) {
 325                 unicode = _PyUnicode_New(1);
 326                 if (!unicode)
 327                     return NULL;
 328                 unicode->str[0] = *u;
 329                 unicode_latin1[*u] = unicode;
 330             }
 331             Py_INCREF(unicode);
 332             return (PyObject *)unicode;
 333         }
 334     }
 335
 336     unicode = _PyUnicode_New(size);
 337     if (!unicode)
 338         return NULL;
 339
 340     /* Copy the Unicode data into the new object */
 341     if (u != NULL)
 342         Py_UNICODE_COPY(unicode->str, u, size);
 343
 344     return (PyObject *)unicode;
 345 }
 346
 347 #ifdef HAVE_WCHAR_H
 348
 349 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 350                                  int size)
 351 {
 352     PyUnicodeObject *unicode;
 353
 354     if (w == NULL) {
 355         PyErr_BadInternalCall();
 356         return NULL;
 357     }
 358
 359     unicode = _PyUnicode_New(size);
 360     if (!unicode)
 361         return NULL;
 362
 363     /* Copy the wchar_t data into the new object */
 364 #ifdef HAVE_USABLE_WCHAR_T
 365     memcpy(unicode->str, w, size * sizeof(wchar_t));
 366 #else
 367     {
 368         register Py_UNICODE *u;
 369         register int i;
 370         u = PyUnicode_AS_UNICODE(unicode);
 371         for (i = size; i > 0; i--)
 372             *u++ = *w++;
 373     }
 374 #endif
 375
 376     return (PyObject *)unicode;
 377 }
 378
 379 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 380                          register wchar_t *w,
 381                          int size)
 382 {
 383     if (unicode == NULL) {
 384         PyErr_BadInternalCall();
 385         return -1;
 386     }
 387
 388     /* If possible, try to copy the 0-termination as well */
 389     if (size > PyUnicode_GET_SIZE(unicode))
 390         size = PyUnicode_GET_SIZE(unicode) + 1;
 391
 392 #ifdef HAVE_USABLE_WCHAR_T
 393     memcpy(w, unicode->str, size * sizeof(wchar_t));
 394 #else
 395     {
 396         register Py_UNICODE *u;
 397         register int i;
 398         u = PyUnicode_AS_UNICODE(unicode);
 399         for (i = size; i > 0; i--)
 400             *w++ = *u++;
 401     }
 402 #endif
 403
 404     if (size > PyUnicode_GET_SIZE(unicode))
 405         return PyUnicode_GET_SIZE(unicode);
 406     else
 407     return size;
 408 }
 409
 410 #endif
 411
 412 PyObject *PyUnicode_FromOrdinal(int ordinal)
 413 {
 414     Py_UNICODE s[1];
 415
 416 #ifdef Py_UNICODE_WIDE
 417     if (ordinal < 0 || ordinal > 0x10ffff) {
 418         PyErr_SetString(PyExc_ValueError,
 419                         "unichr() arg not in range(0x110000) "
 420                         "(wide Python build)");
 421         return NULL;
 422     }
 423 #else
 424     if (ordinal < 0 || ordinal > 0xffff) {
 425         PyErr_SetString(PyExc_ValueError,
 426                         "unichr() arg not in range(0x10000) "
 427                         "(narrow Python build)");
 428         return NULL;
 429     }
 430 #endif
 431
 432     s[0] = (Py_UNICODE)ordinal;
 433     return PyUnicode_FromUnicode(s, 1);
 434 }
 435
 436 PyObject *PyUnicode_FromObject(register PyObject *obj)
 437 {
 438     /* XXX Perhaps we should make this API an alias of
 439            PyObject_Unicode() instead ?! */
 440     if (PyUnicode_CheckExact(obj)) {
 441         Py_INCREF(obj);
 442         return obj;
 443     }
 444     if (PyUnicode_Check(obj)) {
 445         /* For a Unicode subtype that's not a Unicode object,
 446            return a true Unicode object with the same data. */
 447         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 448                                      PyUnicode_GET_SIZE(obj));
 449     }
 450     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 451 }
 452
 453 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 454                                       const char *encoding,
 455                                       const char *errors)
 456 {
 457     const char *s = NULL;
 458     int len;
 459     PyObject *v;
 460
 461     if (obj == NULL) {
 462         PyErr_BadInternalCall();
 463         return NULL;
 464     }
 465
 466 #if 0
 467     /* For b/w compatibility we also accept Unicode objects provided
 468        that no encodings is given and then redirect to
 469        PyObject_Unicode() which then applies the additional logic for
 470        Unicode subclasses.
 471
 472        NOTE: This API should really only be used for object which
 473              represent *encoded* Unicode !
 474
 475     */
 476         if (PyUnicode_Check(obj)) {
 477             if (encoding) {
 478                 PyErr_SetString(PyExc_TypeError,
 479                                 "decoding Unicode is not supported");
 480             return NULL;
 481             }
 482         return PyObject_Unicode(obj);
 483             }
 484 #else
 485     if (PyUnicode_Check(obj)) {
 486         PyErr_SetString(PyExc_TypeError,
 487                         "decoding Unicode is not supported");
 488         return NULL;
 489         }
 490 #endif
 491
 492     /* Coerce object */
 493     if (PyString_Check(obj)) {
 494             s = PyString_AS_STRING(obj);
 495             len = PyString_GET_SIZE(obj);
 496             }
 497     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 498         /* Overwrite the error message with something more useful in
 499            case of a TypeError. */
 500         if (PyErr_ExceptionMatches(PyExc_TypeError))
 501         PyErr_Format(PyExc_TypeError,
 502                          "coercing to Unicode: need string or buffer, "
 503                          "%.80s found",
 504                      obj->ob_type->tp_name);
 505         goto onError;
 506     }
 507
 508     /* Convert to Unicode */
 509     if (len == 0) {
 510         Py_INCREF(unicode_empty);
 511         v = (PyObject *)unicode_empty;
 512     }
 513     else
 514         v = PyUnicode_Decode(s, len, encoding, errors);
 515
 516     return v;
 517
 518  onError:
 519     return NULL;
 520 }
 521
 522 PyObject *PyUnicode_Decode(const char *s,
 523                            int size,
 524                            const char *encoding,
 525                            const char *errors)
 526 {
 527     PyObject *buffer = NULL, *unicode;
 528
 529     if (encoding == NULL)
 530         encoding = PyUnicode_GetDefaultEncoding();
 531
 532     /* Shortcuts for common default encodings */
 533     if (strcmp(encoding, "utf-8") == 0)
 534         return PyUnicode_DecodeUTF8(s, size, errors);
 535     else if (strcmp(encoding, "latin-1") == 0)
 536         return PyUnicode_DecodeLatin1(s, size, errors);
 537 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 538     else if (strcmp(encoding, "mbcs") == 0)
 539         return PyUnicode_DecodeMBCS(s, size, errors);
 540 #endif
 541     else if (strcmp(encoding, "ascii") == 0)
 542         return PyUnicode_DecodeASCII(s, size, errors);
 543
 544     /* Decode via the codec registry */
 545     buffer = PyBuffer_FromMemory((void *)s, size);
 546     if (buffer == NULL)
 547         goto onError;
 548     unicode = PyCodec_Decode(buffer, encoding, errors);
 549     if (unicode == NULL)
 550         goto onError;
 551     if (!PyUnicode_Check(unicode)) {
 552         PyErr_Format(PyExc_TypeError,
 553                      "decoder did not return an unicode object (type=%.400s)",
 554                      unicode->ob_type->tp_name);
 555         Py_DECREF(unicode);
 556         goto onError;
 557     }
 558     Py_DECREF(buffer);
 559     return unicode;
 560
 561  onError:
 562     Py_XDECREF(buffer);
 563     return NULL;
 564 }
 565
 566 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
 567                                     const char *encoding,
 568                                     const char *errors)
 569 {
 570     PyObject *v;
 571
 572     if (!PyUnicode_Check(unicode)) {
 573         PyErr_BadArgument();
 574         goto onError;
 575     }
 576
 577     if (encoding == NULL)
 578         encoding = PyUnicode_GetDefaultEncoding();
 579
 580     /* Decode via the codec registry */
 581     v = PyCodec_Decode(unicode, encoding, errors);
 582     if (v == NULL)
 583         goto onError;
 584     return v;
 585
 586  onError:
 587     return NULL;
 588 }
 589
 590 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 591                            int size,
 592                            const char *encoding,
 593                            const char *errors)
 594 {
 595     PyObject *v, *unicode;
 596
 597     unicode = PyUnicode_FromUnicode(s, size);
 598     if (unicode == NULL)
 599         return NULL;
 600     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 601     Py_DECREF(unicode);
 602     return v;
 603 }
 604
 605 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
 606                                     const char *encoding,
 607                                     const char *errors)
 608 {
 609     PyObject *v;
 610
 611     if (!PyUnicode_Check(unicode)) {
 612         PyErr_BadArgument();
 613         goto onError;
 614     }
 615
 616     if (encoding == NULL)
 617         encoding = PyUnicode_GetDefaultEncoding();
 618
 619     /* Encode via the codec registry */
 620     v = PyCodec_Encode(unicode, encoding, errors);
 621     if (v == NULL)
 622         goto onError;
 623     return v;
 624
 625  onError:
 626     return NULL;
 627 }
 628
 629 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 630                                     const char *encoding,
 631                                     const char *errors)
 632 {
 633     PyObject *v;
 634
 635     if (!PyUnicode_Check(unicode)) {
 636         PyErr_BadArgument();
 637         goto onError;
 638     }
 639
 640     if (encoding == NULL)
 641         encoding = PyUnicode_GetDefaultEncoding();
 642
 643     /* Shortcuts for common default encodings */
 644     if (errors == NULL) {
 645         if (strcmp(encoding, "utf-8") == 0)
 646             return PyUnicode_AsUTF8String(unicode);
 647         else if (strcmp(encoding, "latin-1") == 0)
 648             return PyUnicode_AsLatin1String(unicode);
 649 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
 650         else if (strcmp(encoding, "mbcs") == 0)
 651             return PyUnicode_AsMBCSString(unicode);
 652 #endif
 653         else if (strcmp(encoding, "ascii") == 0)
 654             return PyUnicode_AsASCIIString(unicode);
 655     }
 656
 657     /* Encode via the codec registry */
 658     v = PyCodec_Encode(unicode, encoding, errors);
 659     if (v == NULL)
 660         goto onError;
 661     if (!PyString_Check(v)) {
 662         PyErr_Format(PyExc_TypeError,
 663                      "encoder did not return a string object (type=%.400s)",
 664                      v->ob_type->tp_name);
 665         Py_DECREF(v);
 666         goto onError;
 667     }
 668     return v;
 669
 670  onError:
 671     return NULL;
 672 }
 673
 674 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 675                                             const char *errors)
 676 {
 677     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 678
 679     if (v)
 680         return v;
 681     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 682     if (v && errors == NULL)
 683         ((PyUnicodeObject *)unicode)->defenc = v;
 684     return v;
 685 }
 686
 687 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 688 {
 689     if (!PyUnicode_Check(unicode)) {
 690         PyErr_BadArgument();
 691         goto onError;
 692     }
 693     return PyUnicode_AS_UNICODE(unicode);
 694
 695  onError:
 696     return NULL;
 697 }
 698
 699 int PyUnicode_GetSize(PyObject *unicode)
 700 {
 701     if (!PyUnicode_Check(unicode)) {
 702         PyErr_BadArgument();
 703         goto onError;
 704     }
 705     return PyUnicode_GET_SIZE(unicode);
 706
 707  onError:
 708     return -1;
 709 }
 710
 711 const char *PyUnicode_GetDefaultEncoding(void)
 712 {
 713     return unicode_default_encoding;
 714 }
 715
 716 int PyUnicode_SetDefaultEncoding(const char *encoding)
 717 {
 718     PyObject *v;
 719
 720     /* Make sure the encoding is valid. As side effect, this also
 721        loads the encoding into the codec registry cache. */
 722     v = _PyCodec_Lookup(encoding);
 723     if (v == NULL)
 724         goto onError;
 725     Py_DECREF(v);
 726     strncpy(unicode_default_encoding,
 727             encoding,
 728             sizeof(unicode_default_encoding));
 729     return 0;
 730
 731  onError:
 732     return -1;
 733 }
 734
 735 /* error handling callback helper:
 736    build arguments, call the callback and check the arguments,
 737    if no exception occurred, copy the replacement to the output
 738    and adjust various state variables.
 739    return 0 on success, -1 on error
 740 */
 741
 742 static
 743 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
 744                  const char *encoding, const char *reason,
 745                  const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
 746                  PyObject **output, int *outpos, Py_UNICODE **outptr)
 747 {
 748     static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
 749
 750     PyObject *restuple = NULL;
 751     PyObject *repunicode = NULL;
 752     int outsize = PyUnicode_GET_SIZE(*output);
 753     int requiredsize;
 754     int newpos;
 755     Py_UNICODE *repptr;
 756     int repsize;
 757     int res = -1;
 758
 759     if (*errorHandler == NULL) {
 760         *errorHandler = PyCodec_LookupError(errors);
 761         if (*errorHandler == NULL)
 762            goto onError;
 763     }
 764
 765     if (*exceptionObject == NULL) {
 766         *exceptionObject = PyUnicodeDecodeError_Create(
 767             encoding, input, insize, *startinpos, *endinpos, reason);
 768         if (*exceptionObject == NULL)
 769            goto onError;
 770     }
 771     else {
 772         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 773             goto onError;
 774         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 775             goto onError;
 776         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 777             goto onError;
 778     }
 779
 780     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 781     if (restuple == NULL)
 782         goto onError;
 783     if (!PyTuple_Check(restuple)) {
 784         PyErr_Format(PyExc_TypeError, &argparse[4]);
 785         goto onError;
 786     }
 787     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
 788         goto onError;
 789     if (newpos<0)
 790         newpos = insize+newpos;
 791     if (newpos<0 || newpos>insize) {
 792         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
 793         goto onError;
 794     }
 795
 796     /* need more space? (at least enough for what we
 797        have+the replacement+the rest of the string (starting
 798        at the new input position), so we won't have to check space
 799        when there are no errors in the rest of the string) */
 800     repptr = PyUnicode_AS_UNICODE(repunicode);
 801     repsize = PyUnicode_GET_SIZE(repunicode);
 802     requiredsize = *outpos + repsize + insize-newpos;
 803     if (requiredsize > outsize) {
 804         if (requiredsize<2*outsize)
 805             requiredsize = 2*outsize;
 806         if (PyUnicode_Resize(output, requiredsize) < 0)
 807             goto onError;
 808         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
 809     }
 810     *endinpos = newpos;
 811     *inptr = input + newpos;
 812     Py_UNICODE_COPY(*outptr, repptr, repsize);
 813     *outptr += repsize;
 814     *outpos += repsize;
 815     /* we made it! */
 816     res = 0;
 817
 818     onError:
 819     Py_XDECREF(restuple);
 820     return res;
 821 }
 822
 823 /* --- UTF-7 Codec -------------------------------------------------------- */
 824
 825 /* see RFC2152 for details */
 826
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional)

       The table is indexed by the 7-bit character code; each row below
       covers 16 consecutive codes. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, /* 0x70 - 0x7F */

};
 845
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Characters above 127 and entries marked 1 in
   utf7_special are always special; whitespace (2) and Set O (3)
   entries are special only when encodeWS / encodeO are true.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is alphanumeric (per isalnum()), '+' or '/'. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): the 6-bit value of base64 character c: '+' -> 62,
   '/' -> 63, 'a'..'z' -> 26..51, 'A'..'Z' -> 0..25 and otherwise
   (digits) c + 4, i.e. '0'..'9' -> 52..61.  The arithmetic assumes
   ASCII character codes and that c satisfies B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch,
   write the top 6 pending bits to *out as a base64 character and
   reduce bits by 6.  out and bits are modified in place.  The
   expansion is a bare while statement, not an expression. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the top 16 pending bits as one Py_UNICODE and append
   it to *out.  out, bits and surrogate are modified in place.

   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`: a value in the range 0xDC00..0xDFFF sets errmsg and
   jumps there, and the 16-bit unit following such an error is dropped.

   NOTE(review): only 0xDC00..0xDFFF is tested although the inline
   comments speak of a high surrogate; confirm whether 0xD800..0xDBFF
   was meant to be covered as well. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
 888
 889 PyObject *PyUnicode_DecodeUTF7(const char *s,
 890                                int size,
 891                                const char *errors)
 892 {
         /* Decode the `size` bytes starting at `s` as UTF-7 and return the
            result as a new Unicode object, or NULL on failure.

            Outside a shift sequence, bytes are copied through unchanged,
            "+-" yields a literal '+', and any other '+' opens a base64
            shift sequence.  Inside a shift sequence `charsleft`/`bitsleft`
            accumulate the decoded bits, and the DECODE macro (defined
            above) moves complete 16-bit units from them into the output.
            Every malformed construct jumps to utf7Error, where it is
            reported through unicode_decode_call_errorhandler() using the
            `errors` argument. */
 893     const char *starts = s;
 894     int startinpos;
 895     int endinpos;
 896     int outpos;
 897     const char *e;
 898     PyUnicodeObject *unicode;
 899     Py_UNICODE *p;
 900     const char *errmsg = "";
 901     int inShift = 0;
 902     unsigned int bitsleft = 0;
 903     unsigned long charsleft = 0;
 904     int surrogate = 0;
 905     PyObject *errorHandler = NULL;
 906     PyObject *exc = NULL;
 907
         /* Start with one output slot per input byte; the result is cut
            back to its real length before returning. */
 908     unicode = _PyUnicode_New(size);
 909     if (!unicode)
 910         return NULL;
 911     if (size == 0)
 912         return (PyObject *)unicode;
 913
 914     p = unicode->str;
 915     e = s + size;
 916
 917     while (s < e) {
 918         Py_UNICODE ch;
             /* Re-entry point used after the "unterminated shift sequence"
                handler below moved `s` back inside the input. */
 919         restart:
 920         ch = *s;
 921
 922         if (inShift) {
 923             if ((ch == '-') || !B64CHAR(ch)) {
                     /* Any non-base64 byte (or an explicit '-') ends the
                        shift sequence. */
 924                 inShift = 0;
 925                 s++;
 926
 927                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 928                 if (bitsleft >= 6) {
 929                     /* The shift sequence has a partial character in it. If
 930                        bitsleft < 6 then we could just classify it as padding
 931                        but that is not the case here */
 932
 933                     errmsg = "partial character in shift sequence";
 934                     goto utf7Error;
 935                 }
 936                 /* According to RFC2152 the remaining bits should be zero. We
 937                    choose to signal an error/insert a replacement character
 938                    here to indicate the potential of a misencoded character. */
 939
 940                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 941                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 942                     errmsg = "non-zero padding bits in shift sequence";
 943                     goto utf7Error;
 944                 }
 945
 946                 if (ch == '-') {
 947                     if ((s < e) && (*(s) == '-')) {
 948                         *p++ = '-';
 949                         inShift = 1;
 950                     }
 951                 } else if (SPECIAL(ch,0,0)) {
 952                     errmsg = "unexpected special character";
 953                         goto utf7Error;
 954                 } else  {
 955                     *p++ = ch;
 956                 }
 957             } else {
                     /* Base64 digit: shift its 6 bits into the accumulator
                        and emit whatever complete units are now available. */
 958                 charsleft = (charsleft << 6) | UB64(ch);
 959                 bitsleft += 6;
 960                 s++;
 961                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 962             }
 963         }
 964         else if ( ch == '+' ) {
                 /* Remember where the shift sequence starts: every error
                    raised inside it is reported from this offset. */
 965             startinpos = s-starts;
 966             s++;
 967             if (s < e && *s == '-') {
 968                 s++;
 969                 *p++ = '+';
 970             } else
 971             {
 972                 inShift = 1;
 973                 bitsleft = 0;
 974             }
 975         }
 976         else if (SPECIAL(ch,0,0)) {
                 /* startinpos is otherwise assigned only when a '+' is seen,
                    so record the offending byte's offset here; without this
                    the error handler would be handed an uninitialized start
                    position whenever no '+' preceded the bad byte. */
                 startinpos = s-starts;
 977             errmsg = "unexpected special character";
 978             s++;
 979                 goto utf7Error;
 980         }
 981         else {
 982             *p++ = ch;
 983             s++;
 984         }
 985         continue;
 986     utf7Error:
             /* The handler is given the addresses of s, p, unicode and
                outpos, so all four may have changed when it returns. */
 987         outpos = p-PyUnicode_AS_UNICODE(unicode);
 988         endinpos = s-starts;
 989         if (unicode_decode_call_errorhandler(
 990              errors, &errorHandler,
 991              "utf7", errmsg,
 992              starts, size, &startinpos, &endinpos, &exc, &s,
 993              (PyObject **)&unicode, &outpos, &p))
 994         goto onError;
 995     }
 996
         /* Input ended while still inside a '+' sequence. */
 997     if (inShift) {
 998         outpos = p-PyUnicode_AS_UNICODE(unicode);
 999         endinpos = size;
1000         if (unicode_decode_call_errorhandler(
1001              errors, &errorHandler,
1002              "utf7", "unterminated shift sequence",
1003              starts, size, &startinpos, &endinpos, &exc, &s,
1004              (PyObject **)&unicode, &outpos, &p))
1005             goto onError;
1006         if (s < e)
1007            goto restart;
1008     }
1009
         /* Cut the over-allocated result back to the characters written. */
1010     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1011         goto onError;
1012
1013     Py_XDECREF(errorHandler);
1014     Py_XDECREF(exc);
1015     return (PyObject *)unicode;
1016
1017 onError:
1018     Py_XDECREF(errorHandler);
1019     Py_XDECREF(exc);
1020     Py_DECREF(unicode);
1021     return NULL;
1022 }
1023
1024
1025 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026                    int size,
1027                    int encodeSetO,
1028                    int encodeWhiteSpace,
1029                    const char *errors)
1030 {
         /* Encode the `size` characters at `s` as UTF-7 and return them as a
            new string object, or NULL on failure.

            `encodeSetO` and `encodeWhiteSpace` are handed to the SPECIAL
            macro (defined above), which decides which characters have to be
            written inside a base64 shift sequence.  `errors` is not used by
            this function.  Bits waiting to be written are kept in
            `charsleft`/`bitsleft`; the ENCODE macro (defined above) emits
            base64 digits from them. */
1031     PyObject *v;
1032     /* It might be possible to tighten this worst case */
         /* Computed in unsigned arithmetic so that a wrap-around is well
            defined and can be detected below. */
1033     unsigned int cbAllocated = 5 * (unsigned int)size;
1034     int inShift = 0;
1035     int i = 0;
1036     unsigned int bitsleft = 0;
1037     unsigned long charsleft = 0;
1038     char * out;
1039     char * start;
1040
1041     if (size == 0)
1042                 return PyString_FromStringAndSize(NULL, 0);
1043
         /* Refuse sizes for which 5 * size wrapped around: the buffer would
            be smaller than the loop below assumes and would be overrun.
            The second test keeps the request inside the positive range of
            an int, which is presumably what PyString_FromStringAndSize()
            takes (assumes int and unsigned int have the same width). */
         if (cbAllocated / 5 != (unsigned int)size ||
             cbAllocated > ((unsigned int)-1) / 2)
             return PyErr_NoMemory();

1044     v = PyString_FromStringAndSize(NULL, cbAllocated);
1045     if (v == NULL)
1046         return NULL;
1047
1048     start = out = PyString_AS_STRING(v);
1049     for (;i < size; ++i) {
1050         Py_UNICODE ch = s[i];
1051
1052         if (!inShift) {
1053             if (ch == '+') {
                     /* A literal '+' is written as "+-". */
1054                 *out++ = '+';
1055                 *out++ = '-';
1056             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                     /* Open a shift sequence and queue the character's
                        16 bits for base64 output. */
1057                 charsleft = ch;
1058                 bitsleft = 16;
1059                 *out++ = '+';
1060                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1061                 inShift = bitsleft > 0;
1062             } else {
1063                 *out++ = (char) ch;
1064             }
1065         } else {
1066             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                     /* Flush the leftover bits as one final, zero-padded
                        base64 digit before leaving the shift sequence. */
1067                 *out++ = B64(charsleft << (6-bitsleft));
1068                 charsleft = 0;
1069                 bitsleft = 0;
1070                 /* Characters not in the BASE64 set implicitly unshift the sequence
1071                    so no '-' is required, except if the character is itself a '-' */
1072                 if (B64CHAR(ch) || ch == '-') {
1073                     *out++ = '-';
1074                 }
1075                 inShift = 0;
1076                 *out++ = (char) ch;
1077             } else {
1078                 bitsleft += 16;
1079                 charsleft = (charsleft << 16) | ch;
1080                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082                 /* If the next character is special then we don't need to terminate
1083                    the shift sequence. If the next character is not a BASE64 character
1084                    or '-' then the shift sequence will be terminated implicitly and we
1085                    don't have to insert a '-'. */
1086
1087                 if (bitsleft == 0) {
1088                     if (i + 1 < size) {
1089                         Py_UNICODE ch2 = s[i+1];
1090
1091                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1092
1093                         } else if (B64CHAR(ch2) || ch2 == '-') {
1094                             *out++ = '-';
1095                             inShift = 0;
1096                         } else {
1097                             inShift = 0;
1098                         }
1099
1100                     }
1101                     else {
1102                         *out++ = '-';
1103                         inShift = 0;
1104                     }
1105                 }
1106             }
1107         }
1108     }
         /* Input ended inside a shift sequence with bits still pending:
            flush them and close the sequence explicitly. */
1109     if (bitsleft) {
1110         *out++= B64(charsleft << (6-bitsleft) );
1111         *out++ = '-';
1112     }
1113
         /* Shrink the over-allocated string to the bytes actually written;
            the return value is not checked here and `v` is returned as
            _PyString_Resize() leaves it. */
1114     _PyString_Resize(&v, out - start);
1115     return v;
1116 }
1117
1118 #undef SPECIAL
1119 #undef B64
1120 #undef B64CHAR
1121 #undef UB64
1122 #undef ENCODE
1123 #undef DECODE
1124
1125 /* --- UTF-8 Codec -------------------------------------------------------- */
1126
1127 static
1128 char utf8_code_length[256] = {
1129     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
1130        illegal prefix.  see RFC 2279 for details */
         /* The table is indexed by the first byte of a sequence; each row
            below covers 16 consecutive byte values. */
1131     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
1132     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
1133     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
1134     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
1135     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
1136     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
1137     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
1138     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
1139     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F: not a valid first byte */
1140     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
1141     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
1142     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
1143     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
1144     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
1145     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
1146     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0  /* 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: illegal */
1147 };
1148
1149 PyObject *PyUnicode_DecodeUTF8(const char *s,
1150                                int size,
1151                                const char *errors)
1152 {
         /* Decode `size` bytes of UTF-8 at `s`.  This is the stateful decoder
            called with consumed == NULL, i.e. a sequence cut off by the end
            of the input is reported as an error instead of being left for a
            later call. */
1153     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154 }
1155
1156 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157                                         int size,
1158                                         const char *errors,
1159                                         int *consumed)
1160 {
         /* Decode `size` bytes of UTF-8 at `s` into a new Unicode object.

            errors   -- passed to unicode_decode_call_errorhandler() for
                        every malformed sequence.
            consumed -- if non-NULL, a sequence that runs past the end of
                        the input is not an error: decoding stops in front
                        of it and *consumed receives the number of bytes
                        that were decoded.  If NULL, such a sequence is
                        reported as "unexpected end of data".

            Returns the new object, or NULL on failure. */
1161     const char *starts = s;
1162     int n;
1163     int startinpos;
1164     int endinpos;
1165     int outpos;
1166     const char *e;
1167     PyUnicodeObject *unicode;
1168     Py_UNICODE *p;
1169     const char *errmsg = "";
1170     PyObject *errorHandler = NULL;
1171     PyObject *exc = NULL;
1172
1173     /* Note: size will always be longer than the resulting Unicode
1174        character count */
1175     unicode = _PyUnicode_New(size);
1176     if (!unicode)
1177         return NULL;
1178     if (size == 0) {
1179         if (consumed)
1180             *consumed = 0;
1181         return (PyObject *)unicode;
1182     }
1183
1184     /* Unpack UTF-8 encoded data */
1185     p = unicode->str;
1186     e = s + size;
1187
1188     while (s < e) {
1189         Py_UCS4 ch = (unsigned char)*s;
1190
             /* Fast path: a byte below 0x80 is copied through as is. */
1191         if (ch < 0x80) {
1192             *p++ = (Py_UNICODE)ch;
1193             s++;
1194             continue;
1195         }
1196
             /* Sequence length announced by the first byte (0 = illegal). */
1197         n = utf8_code_length[ch];
1198
             /* The announced sequence would run past the end of the input. */
1199         if (s + n > e) {
1200             if (consumed)
1201                 break;
1202             else {
1203                 errmsg = "unexpected end of data";
1204                 startinpos = s-starts;
1205                 endinpos = size;
1206                 goto utf8Error;
1207             }
1208         }
1209
1210         switch (n) {
1211
1212         case 0:
1213             errmsg = "unexpected code byte";
1214             startinpos = s-starts;
1215             endinpos = startinpos+1;
1216             goto utf8Error;
1217
1218         case 1:
                 /* Bytes below 0x80 were handled above and every table
                    entry from 0x80 upwards is 0 or >= 2, so this case is
                    not expected to be reached. */
1219             errmsg = "internal error";
1220             startinpos = s-starts;
1221             endinpos = startinpos+1;
1222             goto utf8Error;
1223
1224         case 2:
                 /* The trailing byte must have the form 10xxxxxx. */
1225             if ((s[1] & 0xc0) != 0x80) {
1226                 errmsg = "invalid data";
1227                 startinpos = s-starts;
1228                 endinpos = startinpos+2;
1229                 goto utf8Error;
1230             }
1231             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                 /* Reject values that would have fit into a single byte. */
1232             if (ch < 0x80) {
1233                 startinpos = s-starts;
1234                 endinpos = startinpos+2;
1235                 errmsg = "illegal encoding";
1236                 goto utf8Error;
1237             }
1238             else
1239                 *p++ = (Py_UNICODE)ch;
1240             break;
1241
1242         case 3:
1243             if ((s[1] & 0xc0) != 0x80 ||
1244                 (s[2] & 0xc0) != 0x80) {
1245                 errmsg = "invalid data";
1246                 startinpos = s-starts;
1247                 endinpos = startinpos+3;
1248                 goto utf8Error;
1249             }
1250             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                 /* Only values below 0x0800 (which need fewer than three
                    bytes) are rejected here; the note below explains why
                    surrogate values are not. */
1251             if (ch < 0x0800) {
1252                 /* Note: UTF-8 encodings of surrogates are considered
1253                    legal UTF-8 sequences;
1254
1255                    XXX For wide builds (UCS-4) we should probably try
1256                        to recombine the surrogates into a single code
1257                        unit.
1258                 */
1259                 errmsg = "illegal encoding";
1260                 startinpos = s-starts;
1261                 endinpos = startinpos+3;
1262                 goto utf8Error;
1263             }
1264             else
1265                 *p++ = (Py_UNICODE)ch;
1266             break;
1267
1268         case 4:
1269             if ((s[1] & 0xc0) != 0x80 ||
1270                 (s[2] & 0xc0) != 0x80 ||
1271                 (s[3] & 0xc0) != 0x80) {
1272                 errmsg = "invalid data";
1273                 startinpos = s-starts;
1274                 endinpos = startinpos+4;
1275                 goto utf8Error;
1276             }
1277             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279             /* validate and convert to UTF-16 */
1280             if ((ch < 0x10000)        /* minimum value allowed for 4
1281                                          byte encoding */
1282                 || (ch > 0x10ffff))   /* maximum value allowed for
1283                                          UTF-16 */
1284             {
1285                 errmsg = "illegal encoding";
1286                 startinpos = s-starts;
1287                 endinpos = startinpos+4;
1288                 goto utf8Error;
1289             }
1290 #ifdef Py_UNICODE_WIDE
1291             *p++ = (Py_UNICODE)ch;
1292 #else
1293             /*  compute and append the two surrogates: */
1294
1295             /*  translate from 10000..10FFFF to 0..FFFF */
1296             ch -= 0x10000;
1297
1298             /*  high surrogate = top 10 bits added to D800 */
1299             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1300
1301             /*  low surrogate = bottom 10 bits added to DC00 */
1302             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1303 #endif
1304             break;
1305
1306         default:
1307             /* Other sizes are only needed for UCS-4 */
1308             errmsg = "unsupported Unicode code range";
1309             startinpos = s-starts;
1310             endinpos = startinpos+n;
1311             goto utf8Error;
1312         }
             /* A complete sequence was decoded: step over it. */
1313         s += n;
1314         continue;
1315
1316     utf8Error:
         /* The handler is given the addresses of s, p, unicode and outpos,
            so all four may have changed when it returns. */
1317     outpos = p-PyUnicode_AS_UNICODE(unicode);
1318     if (unicode_decode_call_errorhandler(
1319              errors, &errorHandler,
1320              "utf8", errmsg,
1321              starts, size, &startinpos, &endinpos, &exc, &s,
1322              (PyObject **)&unicode, &outpos, &p))
1323         goto onError;
1324     }
1325     if (consumed)
1326         *consumed = s-starts;
1327
1328     /* Adjust length */
1329     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1330         goto onError;
1331
1332     Py_XDECREF(errorHandler);
1333     Py_XDECREF(exc);
1334     return (PyObject *)unicode;
1335
1336 onError:
1337     Py_XDECREF(errorHandler);
1338     Py_XDECREF(exc);
1339     Py_DECREF(unicode);
1340     return NULL;
1341 }
1342
1343 /* Allocation strategy:  if the string is short, convert into a stack buffer
1344    and allocate exactly as much space needed at the end.  Else allocate the
1345    maximum possible needed (4 result bytes per Unicode character), and return
1346    the excess memory at the end.

        Encodes the `size` characters at `s` as UTF-8 and returns them as a new
        string object (NULL on failure).  `errors` is not used by this
        function.  A high surrogate directly followed by a low surrogate is
        combined and written as one 4-byte sequence; any other surrogate is
        written as a 3-byte sequence like every other value below 0x10000.
1347 */
1348 PyObject *
1349 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350                      int size,
1351                      const char *errors)
1352 {
1353 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1354
1355     int i;              /* index into s of next input byte */
1356     PyObject *v;        /* result string object */
1357     char *p;            /* next free byte in output buffer */
1358     int nallocated;     /* number of result bytes allocated */
1359     int nneeded;        /* number of result bytes needed */
1360     char stackbuf[MAX_SHORT_UNICHARS * 4];
1361
1362     assert(s != NULL);
1363     assert(size >= 0);
1364
1365     if (size <= MAX_SHORT_UNICHARS) {
1366         /* Write into the stack buffer; nallocated can't overflow.
1367          * At the end, we'll allocate exactly as much heap space as it
1368          * turns out we need.
1369          */
1370         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371         v = NULL;   /* will allocate after we're done */
1372         p = stackbuf;
1373     }
1374     else {
1375         /* Overallocate on the heap, and give the excess back at the end. */
1376         nallocated = size * 4;
1377         if (nallocated / 4 != size)  /* overflow! */
1378             return PyErr_NoMemory();
1379         v = PyString_FromStringAndSize(NULL, nallocated);
1380         if (v == NULL)
1381             return NULL;
1382         p = PyString_AS_STRING(v);
1383     }
1384
1385     for (i = 0; i < size;) {
1386         Py_UCS4 ch = s[i++];
1387
1388         if (ch < 0x80)
1389             /* Encode ASCII */
1390             *p++ = (char) ch;
1391
1392         else if (ch < 0x0800) {
1393             /* Encode Latin-1 */
                 /* (covers every value from 0x80 to 0x7FF: two bytes) */
1394             *p++ = (char)(0xc0 | (ch >> 6));
1395             *p++ = (char)(0x80 | (ch & 0x3f));
1396         }
1397         else {
1398             /* Encode UCS2 Unicode ordinals */
1399             if (ch < 0x10000) {
1400                 /* Special case: check for high surrogate */
                     /* `i != size` makes sure s[i] is still inside the input
                        before it is inspected. */
1401                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402                     Py_UCS4 ch2 = s[i];
1403                     /* Check for low surrogate and combine the two to
1404                        form a UCS4 value */
1405                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1406                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1407                         i++;
1408                         goto encodeUCS4;
1409                     }
1410                     /* Fall through: handles isolated high surrogates */
1411                 }
1412                 *p++ = (char)(0xe0 | (ch >> 12));
1413                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414                 *p++ = (char)(0x80 | (ch & 0x3f));
1415                 continue;
1416             }
1417 encodeUCS4:
1418             /* Encode UCS4 Unicode ordinals */
1419             *p++ = (char)(0xf0 | (ch >> 18));
1420             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422             *p++ = (char)(0x80 | (ch & 0x3f));
1423         }
1424     }
1425
1426     if (v == NULL) {
1427         /* This was stack allocated. */
1428         nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429         assert(nneeded <= nallocated);
1430         v = PyString_FromStringAndSize(stackbuf, nneeded);
1431     }
1432     else {
1433         /* Cut back to size actually needed. */
1434         nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435         assert(nneeded <= nallocated);
             /* The return value is not checked here; `v` is returned as
                _PyString_Resize() leaves it. */
1436         _PyString_Resize(&v, nneeded);
1437     }
1438     return v;
1439
1440 #undef MAX_SHORT_UNICHARS
1441 }
1442
1443 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444 {
         /* Encode a Unicode object as UTF-8.  Anything that fails
            PyUnicode_Check() is rejected through PyErr_BadArgument() and
            NULL is returned; otherwise the object's buffer and length are
            handed to PyUnicode_EncodeUTF8() with errors == NULL. */
1445     if (!PyUnicode_Check(unicode)) {
1446         PyErr_BadArgument();
1447         return NULL;
1448     }
1449     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450                                 PyUnicode_GET_SIZE(unicode),
1451                                 NULL);
1452 }
1453
1454 /* --- UTF-16 Codec ------------------------------------------------------- */
1455
1456 PyObject *
1457 PyUnicode_DecodeUTF16(const char *s,
1458                       int size,
1459                       const char *errors,
1460                       int *byteorder)
1461 {
         /* Decode `size` bytes of UTF-16 at `s`.  This is the stateful
            decoder called with consumed == NULL, i.e. truncated input is
            reported as an error instead of being left for a later call. */
1462     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463 }
1464
1465 PyObject *
1466 PyUnicode_DecodeUTF16Stateful(const char *s,
1467                               int size,
1468                               const char *errors,
1469                               int *byteorder,
1470                               int *consumed)
1471 {
         /* Decode `size` bytes of UTF-16 at `s` into a new Unicode object.

            errors    -- passed to unicode_decode_call_errorhandler() for
                         every malformed unit.
            byteorder -- may be NULL.  On entry *byteorder selects the byte
                         order: -1 little endian, 1 big endian, 0 native
                         order with a leading BOM (if any) deciding and being
                         skipped.  On exit it receives the order that was
                         used.
            consumed  -- if non-NULL, input that stops in the middle of a
                         16-bit unit or of a surrogate pair is not an error:
                         decoding stops in front of the incomplete data and
                         *consumed receives the number of bytes decoded.

            Returns the new object, or NULL on failure. */
1472     const char *starts = s;
1473     int startinpos;
1474     int endinpos;
1475     int outpos;
1476     PyUnicodeObject *unicode;
1477     Py_UNICODE *p;
1478     const unsigned char *q, *e;
1479     int bo = 0;       /* assume native ordering by default */
1480     const char *errmsg = "";
1481     /* Offsets from q for retrieving byte pairs in the right order. */
1482 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483     int ihi = 1, ilo = 0;
1484 #else
1485     int ihi = 0, ilo = 1;
1486 #endif
1487     PyObject *errorHandler = NULL;
1488     PyObject *exc = NULL;
1489
1490     /* Note: size will always be longer than the resulting Unicode
1491        character count */
1492     unicode = _PyUnicode_New(size);
1493     if (!unicode)
1494         return NULL;
1495     if (size == 0) {
             /* Nothing was read; report that to stateful callers, exactly
                as PyUnicode_DecodeUTF8Stateful does for empty input. */
             if (consumed)
                 *consumed = 0;
1496         return (PyObject *)unicode;
         }
1497
1498     /* Unpack UTF-16 encoded data */
1499     p = unicode->str;
1500     q = (unsigned char *)s;
1501     e = q + size;
1502
1503     if (byteorder)
1504         bo = *byteorder;
1505
1506     /* Check for BOM marks (U+FEFF) in the input and adjust current
1507        byte order setting accordingly. In native mode, the leading BOM
1508        mark is skipped, in all other modes, it is copied to the output
1509        stream as-is (giving a ZWNBSP character). */
1510     if (bo == 0) {
1511         if (size >= 2) {
1512             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1513 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1514             if (bom == 0xFEFF) {
1515                 q += 2;
1516                 bo = -1;
1517             }
1518             else if (bom == 0xFFFE) {
1519                 q += 2;
1520                 bo = 1;
1521             }
1522 #else
1523             if (bom == 0xFEFF) {
1524                 q += 2;
1525                 bo = 1;
1526             }
1527             else if (bom == 0xFFFE) {
1528                 q += 2;
1529                 bo = -1;
1530             }
1531 #endif
1532         }
1533     }
1534
1535     if (bo == -1) {
1536         /* force LE */
1537         ihi = 1;
1538         ilo = 0;
1539     }
1540     else if (bo == 1) {
1541         /* force BE */
1542         ihi = 0;
1543         ilo = 1;
1544     }
1545
1546     while (q < e) {
1547         Py_UNICODE ch;
1548         /* remaining bytes at the end? (size should be even) */
1549         if (e-q<2) {
1550             if (consumed)
1551                 break;
1552             errmsg = "truncated data";
1553             startinpos = ((const char *)q)-starts;
1554             endinpos = ((const char *)e)-starts;
1555             goto utf16Error;
1556             /* The remaining input chars are ignored if the callback
1557                chooses to skip the input */
1558         }
1559         ch = (q[ihi] << 8) | q[ilo];
1560
1561         q += 2;
1562
             /* Anything outside the surrogate range is a complete character. */
1563         if (ch < 0xD800 || ch > 0xDFFF) {
1564             *p++ = ch;
1565             continue;
1566         }
1567
1568         /* UTF-16 code pair: */
             /* A second unit needs two more bytes.  Testing only `q >= e`
                would let a single trailing byte through and make the read
                of q[ihi]/q[ilo] below run one byte past the input. */
1569         if (e - q < 2) {
                 if (consumed) {
                     /* Stateful mode: leave the surrogate unread so that
                        the caller can retry once more data has arrived. */
                     q -= 2;
                     break;
                 }
1570             errmsg = "unexpected end of data";
1571             startinpos = (((const char *)q)-2)-starts;
1572             endinpos = ((const char *)e)-starts;
1573             goto utf16Error;
1574         }
1575         if (0xD800 <= ch && ch <= 0xDBFF) {
1576             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577             q += 2;
1578             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1579 #ifndef Py_UNICODE_WIDE
1580                 *p++ = ch;
1581                 *p++ = ch2;
1582 #else
1583                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1584 #endif
1585                 continue;
1586             }
1587             else {
                     /* High surrogate not followed by a low one: report
                        the first unit of the pair. */
1588                 errmsg = "illegal UTF-16 surrogate";
1589                 startinpos = (((const char *)q)-4)-starts;
1590                 endinpos = startinpos+2;
1591                 goto utf16Error;
1592             }
1593
1594         }
             /* Only a low surrogate without a preceding high one gets here. */
1595         errmsg = "illegal encoding";
1596         startinpos = (((const char *)q)-2)-starts;
1597         endinpos = startinpos+2;
1598         /* Fall through to report the error */
1599
1600     utf16Error:
             /* The handler is given the addresses of q, p, unicode and
                outpos, so all four may have changed when it returns. */
1601         outpos = p-PyUnicode_AS_UNICODE(unicode);
1602         if (unicode_decode_call_errorhandler(
1603                  errors, &errorHandler,
1604                  "utf16", errmsg,
1605                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606                  (PyObject **)&unicode, &outpos, &p))
1607             goto onError;
1608     }
1609
1610     if (byteorder)
1611         *byteorder = bo;
1612
1613     if (consumed)
1614         *consumed = (const char *)q-starts;
1615
1616     /* Adjust length */
1617     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1618         goto onError;
1619
1620     Py_XDECREF(errorHandler);
1621     Py_XDECREF(exc);
1622     return (PyObject *)unicode;
1623
1624 onError:
1625     Py_DECREF(unicode);
1626     Py_XDECREF(errorHandler);
1627     Py_XDECREF(exc);
1628     return NULL;
1629 }
1630
1631 PyObject *
1632 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633                       int size,
1634                       const char *errors,
1635                       int byteorder)
1636 {
         /* Encode the `size` characters at `s` as UTF-16 and return them as
            a new string object (NULL if the allocation fails).

            byteorder -- -1 writes little endian, 1 writes big endian, and 0
                         writes native order preceded by a BOM (U+FEFF).
            errors    -- not used by this function.

            On wide (Py_UNICODE_WIDE) builds every character >= 0x10000 is
            written as a surrogate pair; those characters are counted in
            advance so that the result can be allocated in one piece. */
1637     PyObject *v;
1638     unsigned char *p;
1639 #ifdef Py_UNICODE_WIDE
1640     int i, pairs;
1641 #else
1642     const int pairs = 0;
1643 #endif
1644     /* Offsets from p for storing byte pairs in the right order. */
1645 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646     int ihi = 1, ilo = 0;
1647 #else
1648     int ihi = 0, ilo = 1;
1649 #endif
1650
     /* Store the 16-bit value CH at p in the selected byte order and advance
        p by two bytes. */
1651 #define STORECHAR(CH)                   \
1652     do {                                \
1653         p[ihi] = ((CH) >> 8) & 0xff;    \
1654         p[ilo] = (CH) & 0xff;           \
1655         p += 2;                         \
1656     } while(0)
1657
1658 #ifdef Py_UNICODE_WIDE
1659     for (i = pairs = 0; i < size; i++)
1660         if (s[i] >= 0x10000)
1661             pairs++;
1662 #endif
         /* Two bytes per output unit: one unit per character, one more for
            each surrogate pair, plus one for the BOM when byteorder == 0.
            NOTE(review): this sum is computed in int without any overflow
            check -- confirm that callers cannot pass a size large enough
            to wrap it. */
1663     v = PyString_FromStringAndSize(NULL,
1664                   2 * (size + pairs + (byteorder == 0)));
1665     if (v == NULL)
1666         return NULL;
1667
1668     p = (unsigned char *)PyString_AS_STRING(v);
         /* The BOM is stored before ihi/ilo are overridden below, i.e. in
            native byte order. */
1669     if (byteorder == 0)
1670         STORECHAR(0xFEFF);
1671     if (size == 0)
1672         return v;
1673
1674     if (byteorder == -1) {
1675         /* force LE */
1676         ihi = 1;
1677         ilo = 0;
1678     }
1679     else if (byteorder == 1) {
1680         /* force BE */
1681         ihi = 0;
1682         ilo = 1;
1683     }
1684
1685     while (size-- > 0) {
1686         Py_UNICODE ch = *s++;
             /* ch2 stays 0 unless the character has to be split into a
                surrogate pair; a low surrogate is never 0, so the value
                doubles as the "second unit present" flag. */
1687         Py_UNICODE ch2 = 0;
1688 #ifdef Py_UNICODE_WIDE
1689         if (ch >= 0x10000) {
1690             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691             ch  = 0xD800 | ((ch-0x10000) >> 10);
1692         }
1693 #endif
1694         STORECHAR(ch);
1695         if (ch2)
1696             STORECHAR(ch2);
1697     }
1698     return v;
1699 #undef STORECHAR
1700 }
1701
1702 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703 {
         /* Encode a Unicode object as UTF-16.  Anything that fails
            PyUnicode_Check() is rejected through PyErr_BadArgument() and
            NULL is returned; otherwise the object's buffer and length are
            handed to PyUnicode_EncodeUTF16() with errors == NULL and
            byteorder == 0 (native byte order, BOM written first). */
1704     if (!PyUnicode_Check(unicode)) {
1705         PyErr_BadArgument();
1706         return NULL;
1707     }
1708     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709                                  PyUnicode_GET_SIZE(unicode),
1710                                  NULL,
1711                                  0);
1712 }
1713
1714 /* --- Unicode Escape Codec ----------------------------------------------- */
1715
1716 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1717
1718 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719                                         int size,
1720                                         const char *errors)
1721 {
1722     const char *starts = s;
1723     int startinpos;
1724     int endinpos;
1725     int outpos;
1726     int i;
1727     PyUnicodeObject *v;
1728     Py_UNICODE *p;
1729     const char *end;
1730     char* message;
1731     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1732     PyObject *errorHandler = NULL;
1733     PyObject *exc = NULL;
1734
1735     /* Escaped strings will always be longer than the resulting
1736        Unicode string, so we start with size here and then reduce the
1737        length after conversion to the true value.
1738        (but if the error callback returns a long replacement string
1739        we'll have to allocate more space) */
1740     v = _PyUnicode_New(size);
1741     if (v == NULL)
1742         goto onError;
1743     if (size == 0)
1744         return (PyObject *)v;
1745
1746     p = PyUnicode_AS_UNICODE(v);
1747     end = s + size;
1748
1749     while (s < end) {
1750         unsigned char c;
1751         Py_UNICODE x;
1752         int digits;
1753
1754         /* Non-escape characters are interpreted as Unicode ordinals */
1755         if (*s != '\\') {
1756             *p++ = (unsigned char) *s++;
1757             continue;
1758         }
1759
1760         startinpos = s-starts;
1761         /* \ - Escapes */
1762         s++;
1763         switch (*s++) {
1764
1765         /* \x escapes */
1766         case '\n': break;
1767         case '\\': *p++ = '\\'; break;
1768         case '\'': *p++ = '\''; break;
1769         case '\"': *p++ = '\"'; break;
1770         case 'b': *p++ = '\b'; break;
1771         case 'f': *p++ = '\014'; break; /* FF */
1772         case 't': *p++ = '\t'; break;
1773         case 'n': *p++ = '\n'; break;
1774         case 'r': *p++ = '\r'; break;
1775         case 'v': *p++ = '\013'; break; /* VT */
1776         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778         /* \OOO (octal) escapes */
1779         case '0': case '1': case '2': case '3':
1780         case '4': case '5': case '6': case '7':
1781             x = s[-1] - '0';
1782             if ('0' <= *s && *s <= '7') {
1783                 x = (x<<3) + *s++ - '0';
1784                 if ('0' <= *s && *s <= '7')
1785                     x = (x<<3) + *s++ - '0';
1786             }
1787             *p++ = x;
1788             break;
1789
1790         /* hex escapes */
1791         /* \xXX */
1792         case 'x':
1793             digits = 2;
1794             message = "truncated \\xXX escape";
1795             goto hexescape;
1796
1797         /* \uXXXX */
1798         case 'u':
1799             digits = 4;
1800             message = "truncated \\uXXXX escape";
1801             goto hexescape;
1802
1803         /* \UXXXXXXXX */
1804         case 'U':
1805             digits = 8;
1806             message = "truncated \\UXXXXXXXX escape";
1807         hexescape:
1808             chr = 0;
1809             outpos = p-PyUnicode_AS_UNICODE(v);
1810             if (s+digits>end) {
1811                 endinpos = size;
1812                 if (unicode_decode_call_errorhandler(
1813                     errors, &errorHandler,
1814                     "unicodeescape", "end of string in escape sequence",
1815                     starts, size, &startinpos, &endinpos, &exc, &s,
1816                     (PyObject **)&v, &outpos, &p))
1817                     goto onError;
1818                 goto nextByte;
1819             }
1820             for (i = 0; i < digits; ++i) {
1821                 c = (unsigned char) s[i];
1822                 if (!isxdigit(c)) {
1823                     endinpos = (s+i+1)-starts;
1824                     if (unicode_decode_call_errorhandler(
1825                         errors, &errorHandler,
1826                         "unicodeescape", message,
1827                         starts, size, &startinpos, &endinpos, &exc, &s,
1828                         (PyObject **)&v, &outpos, &p))
1829                         goto onError;
1830                     goto nextByte;
1831                 }
1832                 chr = (chr<<4) & ~0xF;
1833                 if (c >= '0' && c <= '9')
1834                     chr += c - '0';
1835                 else if (c >= 'a' && c <= 'f')
1836                     chr += 10 + c - 'a';
1837                 else
1838                     chr += 10 + c - 'A';
1839             }
1840             s += i;
1841             if (chr == 0xffffffff && PyErr_Occurred())
1842                 /* _decoding_error will have already written into the
1843                    target buffer. */
1844                 break;
1845         store:
1846             /* when we get here, chr is a 32-bit unicode character */
1847             if (chr <= 0xffff)
1848                 /* UCS-2 character */
1849                 *p++ = (Py_UNICODE) chr;
1850             else if (chr <= 0x10ffff) {
1851                 /* UCS-4 character. Either store directly, or as
1852                    surrogate pair. */
1853 #ifdef Py_UNICODE_WIDE
1854                 *p++ = chr;
1855 #else
1856                 chr -= 0x10000L;
1857                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1858                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1859 #endif
1860             } else {
1861                 endinpos = s-starts;
1862                 outpos = p-PyUnicode_AS_UNICODE(v);
1863                 if (unicode_decode_call_errorhandler(
1864                     errors, &errorHandler,
1865                     "unicodeescape", "illegal Unicode character",
1866                     starts, size, &startinpos, &endinpos, &exc, &s,
1867                     (PyObject **)&v, &outpos, &p))
1868                     goto onError;
1869             }
1870             break;
1871
1872         /* \N{name} */
1873         case 'N':
1874             message = "malformed \\N character escape";
1875             if (ucnhash_CAPI == NULL) {
1876                 /* load the unicode data module */
1877                 PyObject *m, *v;
1878                 m = PyImport_ImportModule("unicodedata");
1879                 if (m == NULL)
1880                     goto ucnhashError;
1881                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882                 Py_DECREF(m);
1883                 if (v == NULL)
1884                     goto ucnhashError;
1885                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886                 Py_DECREF(v);
1887                 if (ucnhash_CAPI == NULL)
1888                     goto ucnhashError;
1889             }
1890             if (*s == '{') {
1891                 const char *start = s+1;
1892                 /* look for the closing brace */
1893                 while (*s != '}' && s < end)
1894                     s++;
1895                 if (s > start && s < end && *s == '}') {
1896                     /* found a name.  look it up in the unicode database */
1897                     message = "unknown Unicode character name";
1898                     s++;
1899                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900                         goto store;
1901                 }
1902             }
1903             endinpos = s-starts;
1904             outpos = p-PyUnicode_AS_UNICODE(v);
1905             if (unicode_decode_call_errorhandler(
1906                 errors, &errorHandler,
1907                 "unicodeescape", message,
1908                 starts, size, &startinpos, &endinpos, &exc, &s,
1909                 (PyObject **)&v, &outpos, &p))
1910                 goto onError;
1911             break;
1912
1913         default:
1914             if (s > end) {
1915                 message = "\\ at end of string";
1916                 s--;
1917                 endinpos = s-starts;
1918                 outpos = p-PyUnicode_AS_UNICODE(v);
1919                 if (unicode_decode_call_errorhandler(
1920                     errors, &errorHandler,
1921                     "unicodeescape", message,
1922                     starts, size, &startinpos, &endinpos, &exc, &s,
1923                     (PyObject **)&v, &outpos, &p))
1924                     goto onError;
1925             }
1926             else {
1927                 *p++ = '\\';
1928                 *p++ = (unsigned char)s[-1];
1929             }
1930             break;
1931         }
1932         nextByte:
1933         ;
1934     }
1935     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
1936         goto onError;
1937     Py_XDECREF(errorHandler);
1938     Py_XDECREF(exc);
1939     return (PyObject *)v;
1940
1941 ucnhashError:
1942     PyErr_SetString(
1943         PyExc_UnicodeError,
1944         "\\N escapes not supported (can't load unicodedata module)"
1945         );
1946     Py_XDECREF(errorHandler);
1947     Py_XDECREF(exc);
1948     return NULL;
1949
1950 onError:
1951     Py_XDECREF(v);
1952     Py_XDECREF(errorHandler);
1953     Py_XDECREF(exc);
1954     return NULL;
1955 }
1956
1957 /* Return a Unicode-Escape string version of the Unicode object.
1958
1959    If quotes is true, the string is enclosed in u"" or u'' quotes as
1960    appropriate.
1961
1962 */
1963
1964 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965                                   int size,
1966                                   Py_UNICODE ch);
1967
1968 static
1969 PyObject *unicodeescape_string(const Py_UNICODE *s,
1970                                int size,
1971                                int quotes)
1972 {
1973     PyObject *repr;
1974     char *p;
1975
1976     static const char *hexdigit = "0123456789abcdef";
1977
1978     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979     if (repr == NULL)
1980         return NULL;
1981
1982     p = PyString_AS_STRING(repr);
1983
1984     if (quotes) {
1985         *p++ = 'u';
1986         *p++ = (findchar(s, size, '\'') &&
1987                 !findchar(s, size, '"')) ? '"' : '\'';
1988     }
1989     while (size-- > 0) {
1990         Py_UNICODE ch = *s++;
1991
1992         /* Escape quotes */
1993         if (quotes &&
1994             (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1995             *p++ = '\\';
1996             *p++ = (char) ch;
1997             continue;
1998         }
1999
2000 #ifdef Py_UNICODE_WIDE
2001         /* Map 21-bit characters to '\U00xxxxxx' */
2002         else if (ch >= 0x10000) {
2003             int offset = p - PyString_AS_STRING(repr);
2004
2005             /* Resize the string if necessary */
2006             if (offset + 12 > PyString_GET_SIZE(repr)) {
2007                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
2008                     return NULL;
2009                 p = PyString_AS_STRING(repr) + offset;
2010             }
2011
2012             *p++ = '\\';
2013             *p++ = 'U';
2014             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2021             *p++ = hexdigit[ch & 0x0000000F];
2022             continue;
2023         }
2024 #endif
2025         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026         else if (ch >= 0xD800 && ch < 0xDC00) {
2027             Py_UNICODE ch2;
2028             Py_UCS4 ucs;
2029
2030             ch2 = *s++;
2031             size--;
2032             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034                 *p++ = '\\';
2035                 *p++ = 'U';
2036                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043                 *p++ = hexdigit[ucs & 0x0000000F];
2044                 continue;
2045             }
2046             /* Fall through: isolated surrogates are copied as-is */
2047             s--;
2048             size++;
2049         }
2050
2051         /* Map 16-bit characters to '\uxxxx' */
2052         if (ch >= 256) {
2053             *p++ = '\\';
2054             *p++ = 'u';
2055             *p++ = hexdigit[(ch >> 12) & 0x000F];
2056             *p++ = hexdigit[(ch >> 8) & 0x000F];
2057             *p++ = hexdigit[(ch >> 4) & 0x000F];
2058             *p++ = hexdigit[ch & 0x000F];
2059         }
2060
2061         /* Map special whitespace to '\t', \n', '\r' */
2062         else if (ch == '\t') {
2063             *p++ = '\\';
2064             *p++ = 't';
2065         }
2066         else if (ch == '\n') {
2067             *p++ = '\\';
2068             *p++ = 'n';
2069         }
2070         else if (ch == '\r') {
2071             *p++ = '\\';
2072             *p++ = 'r';
2073         }
2074
2075         /* Map non-printable US ASCII to '\xhh' */
2076         else if (ch < ' ' || ch >= 0x7F) {
2077             *p++ = '\\';
2078             *p++ = 'x';
2079             *p++ = hexdigit[(ch >> 4) & 0x000F];
2080             *p++ = hexdigit[ch & 0x000F];
2081         }
2082
2083         /* Copy everything else as-is */
2084         else
2085             *p++ = (char) ch;
2086     }
2087     if (quotes)
2088         *p++ = PyString_AS_STRING(repr)[1];
2089
2090     *p = '\0';
2091     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2092     return repr;
2093 }
2094
2095 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096                                         int size)
2097 {
2098     return unicodeescape_string(s, size, 0);
2099 }
2100
2101 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2102 {
2103     if (!PyUnicode_Check(unicode)) {
2104         PyErr_BadArgument();
2105         return NULL;
2106     }
2107     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108                                          PyUnicode_GET_SIZE(unicode));
2109 }
2110
2111 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2112
2113 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114                                            int size,
2115                                            const char *errors)
2116 {
2117     const char *starts = s;
2118     int startinpos;
2119     int endinpos;
2120     int outpos;
2121     PyUnicodeObject *v;
2122     Py_UNICODE *p;
2123     const char *end;
2124     const char *bs;
2125     PyObject *errorHandler = NULL;
2126     PyObject *exc = NULL;
2127
2128     /* Escaped strings will always be longer than the resulting
2129        Unicode string, so we start with size here and then reduce the
2130        length after conversion to the true value. (But decoding error
2131        handler might have to resize the string) */
2132     v = _PyUnicode_New(size);
2133     if (v == NULL)
2134         goto onError;
2135     if (size == 0)
2136         return (PyObject *)v;
2137     p = PyUnicode_AS_UNICODE(v);
2138     end = s + size;
2139     while (s < end) {
2140         unsigned char c;
2141         Py_UCS4 x;
2142         int i;
2143         int count;
2144
2145         /* Non-escape characters are interpreted as Unicode ordinals */
2146         if (*s != '\\') {
2147             *p++ = (unsigned char)*s++;
2148             continue;
2149         }
2150         startinpos = s-starts;
2151
2152         /* \u-escapes are only interpreted iff the number of leading
2153            backslashes if odd */
2154         bs = s;
2155         for (;s < end;) {
2156             if (*s != '\\')
2157                 break;
2158             *p++ = (unsigned char)*s++;
2159         }
2160         if (((s - bs) & 1) == 0 ||
2161             s >= end ||
2162             (*s != 'u' && *s != 'U')) {
2163             continue;
2164         }
2165         p--;
2166         count = *s=='u' ? 4 : 8;
2167         s++;
2168
2169         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2170         outpos = p-PyUnicode_AS_UNICODE(v);
2171         for (x = 0, i = 0; i < count; ++i, ++s) {
2172             c = (unsigned char)*s;
2173             if (!isxdigit(c)) {
2174                 endinpos = s-starts;
2175                 if (unicode_decode_call_errorhandler(
2176                     errors, &errorHandler,
2177                     "rawunicodeescape", "truncated \\uXXXX",
2178                     starts, size, &startinpos, &endinpos, &exc, &s,
2179                     (PyObject **)&v, &outpos, &p))
2180                     goto onError;
2181                 goto nextByte;
2182             }
2183             x = (x<<4) & ~0xF;
2184             if (c >= '0' && c <= '9')
2185                 x += c - '0';
2186             else if (c >= 'a' && c <= 'f')
2187                 x += 10 + c - 'a';
2188             else
2189                 x += 10 + c - 'A';
2190         }
2191 #ifndef Py_UNICODE_WIDE
2192         if (x > 0x10000) {
2193             if (unicode_decode_call_errorhandler(
2194                     errors, &errorHandler,
2195                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196                     starts, size, &startinpos, &endinpos, &exc, &s,
2197                     (PyObject **)&v, &outpos, &p))
2198                     goto onError;
2199         }
2200 #endif
2201         *p++ = x;
2202         nextByte:
2203         ;
2204     }
2205     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2206         goto onError;
2207     Py_XDECREF(errorHandler);
2208     Py_XDECREF(exc);
2209     return (PyObject *)v;
2210
2211  onError:
2212     Py_XDECREF(v);
2213     Py_XDECREF(errorHandler);
2214     Py_XDECREF(exc);
2215     return NULL;
2216 }
2217
2218 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219                                            int size)
2220 {
2221     PyObject *repr;
2222     char *p;
2223     char *q;
2224
2225     static const char *hexdigit = "0123456789abcdef";
2226
2227 #ifdef Py_UNICODE_WIDE
2228     repr = PyString_FromStringAndSize(NULL, 10 * size);
2229 #else
2230     repr = PyString_FromStringAndSize(NULL, 6 * size);
2231 #endif
2232     if (repr == NULL)
2233         return NULL;
2234     if (size == 0)
2235         return repr;
2236
2237     p = q = PyString_AS_STRING(repr);
2238     while (size-- > 0) {
2239         Py_UNICODE ch = *s++;
2240 #ifdef Py_UNICODE_WIDE
2241         /* Map 32-bit characters to '\Uxxxxxxxx' */
2242         if (ch >= 0x10000) {
2243             *p++ = '\\';
2244             *p++ = 'U';
2245             *p++ = hexdigit[(ch >> 28) & 0xf];
2246             *p++ = hexdigit[(ch >> 24) & 0xf];
2247             *p++ = hexdigit[(ch >> 20) & 0xf];
2248             *p++ = hexdigit[(ch >> 16) & 0xf];
2249             *p++ = hexdigit[(ch >> 12) & 0xf];
2250             *p++ = hexdigit[(ch >> 8) & 0xf];
2251             *p++ = hexdigit[(ch >> 4) & 0xf];
2252             *p++ = hexdigit[ch & 15];
2253         }
2254         else
2255 #endif
2256         /* Map 16-bit characters to '\uxxxx' */
2257         if (ch >= 256) {
2258             *p++ = '\\';
2259             *p++ = 'u';
2260             *p++ = hexdigit[(ch >> 12) & 0xf];
2261             *p++ = hexdigit[(ch >> 8) & 0xf];
2262             *p++ = hexdigit[(ch >> 4) & 0xf];
2263             *p++ = hexdigit[ch & 15];
2264         }
2265         /* Copy everything else as-is */
2266         else
2267             *p++ = (char) ch;
2268     }
2269     *p = '\0';
2270     _PyString_Resize(&repr, p - q);
2271     return repr;
2272 }
2273
2274 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2275 {
2276     if (!PyUnicode_Check(unicode)) {
2277         PyErr_BadArgument();
2278         return NULL;
2279     }
2280     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281                                             PyUnicode_GET_SIZE(unicode));
2282 }
2283
2284 /* --- Unicode Internal Codec ------------------------------------------- */
2285
2286 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287                                            int size,
2288                                            const char *errors)
2289 {
2290     const char *starts = s;
2291     int startinpos;
2292     int endinpos;
2293     int outpos;
2294     Py_UNICODE unimax;
2295     PyUnicodeObject *v;
2296     Py_UNICODE *p;
2297     const char *end;
2298     const char *reason;
2299     PyObject *errorHandler = NULL;
2300     PyObject *exc = NULL;
2301
2302     unimax = PyUnicode_GetMax();
2303     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2304     if (v == NULL)
2305         goto onError;
2306     if (PyUnicode_GetSize((PyObject *)v) == 0)
2307         return (PyObject *)v;
2308     p = PyUnicode_AS_UNICODE(v);
2309     end = s + size;
2310
2311     while (s < end) {
2312         *p = *(Py_UNICODE *)s;
2313         /* We have to sanity check the raw data, otherwise doom looms for
2314            some malformed UCS-4 data. */
2315         if (
2316             #ifdef Py_UNICODE_WIDE
2317             *p > unimax || *p < 0 ||
2318             #endif
2319             end-s < Py_UNICODE_SIZE
2320             )
2321             {
2322             startinpos = s - starts;
2323             if (end-s < Py_UNICODE_SIZE) {
2324                 endinpos = end-starts;
2325                 reason = "truncated input";
2326             }
2327             else {
2328                 endinpos = s - starts + Py_UNICODE_SIZE;
2329                 reason = "illegal code point (> 0x10FFFF)";
2330             }
2331             outpos = p - PyUnicode_AS_UNICODE(v);
2332             if (unicode_decode_call_errorhandler(
2333                     errors, &errorHandler,
2334                     "unicode_internal", reason,
2335                     starts, size, &startinpos, &endinpos, &exc, &s,
2336                     (PyObject **)&v, &outpos, &p)) {
2337                 goto onError;
2338             }
2339         }
2340         else {
2341             p++;
2342             s += Py_UNICODE_SIZE;
2343         }
2344     }
2345
2346     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2347         goto onError;
2348     Py_XDECREF(errorHandler);
2349     Py_XDECREF(exc);
2350     return (PyObject *)v;
2351
2352  onError:
2353     Py_XDECREF(v);
2354     Py_XDECREF(errorHandler);
2355     Py_XDECREF(exc);
2356     return NULL;
2357 }
2358
2359 /* --- Latin-1 Codec ------------------------------------------------------ */
2360
2361 PyObject *PyUnicode_DecodeLatin1(const char *s,
2362                                  int size,
2363                                  const char *errors)
2364 {
2365     PyUnicodeObject *v;
2366     Py_UNICODE *p;
2367
2368     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2369     if (size == 1) {
2370         Py_UNICODE r = *(unsigned char*)s;
2371         return PyUnicode_FromUnicode(&r, 1);
2372     }
2373
2374     v = _PyUnicode_New(size);
2375     if (v == NULL)
2376         goto onError;
2377     if (size == 0)
2378         return (PyObject *)v;
2379     p = PyUnicode_AS_UNICODE(v);
2380     while (size-- > 0)
2381         *p++ = (unsigned char)*s++;
2382     return (PyObject *)v;
2383
2384  onError:
2385     Py_XDECREF(v);
2386     return NULL;
2387 }
2388
2389 /* create or adjust a UnicodeEncodeError */
2390 static void make_encode_exception(PyObject **exceptionObject,
2391     const char *encoding,
2392     const Py_UNICODE *unicode, int size,
2393     int startpos, int endpos,
2394     const char *reason)
2395 {
2396     if (*exceptionObject == NULL) {
2397         *exceptionObject = PyUnicodeEncodeError_Create(
2398             encoding, unicode, size, startpos, endpos, reason);
2399     }
2400     else {
2401         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2402             goto onError;
2403         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2404             goto onError;
2405         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2406             goto onError;
2407         return;
2408         onError:
2409         Py_DECREF(*exceptionObject);
2410         *exceptionObject = NULL;
2411     }
2412 }
2413
2414 /* raises a UnicodeEncodeError */
2415 static void raise_encode_exception(PyObject **exceptionObject,
2416     const char *encoding,
2417     const Py_UNICODE *unicode, int size,
2418     int startpos, int endpos,
2419     const char *reason)
2420 {
2421     make_encode_exception(exceptionObject,
2422         encoding, unicode, size, startpos, endpos, reason);
2423     if (*exceptionObject != NULL)
2424         PyCodec_StrictErrors(*exceptionObject);
2425 }
2426
2427 /* error handling callback helper:
2428    build arguments, call the callback and check the arguments,
2429    put the result into newpos and return the replacement string, which
2430    has to be freed by the caller */
2431 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2432     PyObject **errorHandler,
2433     const char *encoding, const char *reason,
2434     const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2435     int startpos, int endpos,
2436     int *newpos)
2437 {
2438     static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2439
2440     PyObject *restuple;
2441     PyObject *resunicode;
2442
2443     if (*errorHandler == NULL) {
2444         *errorHandler = PyCodec_LookupError(errors);
2445         if (*errorHandler == NULL)
2446             return NULL;
2447     }
2448
2449     make_encode_exception(exceptionObject,
2450         encoding, unicode, size, startpos, endpos, reason);
2451     if (*exceptionObject == NULL)
2452         return NULL;
2453
2454     restuple = PyObject_CallFunctionObjArgs(
2455         *errorHandler, *exceptionObject, NULL);
2456     if (restuple == NULL)
2457         return NULL;
2458     if (!PyTuple_Check(restuple)) {
2459         PyErr_Format(PyExc_TypeError, &argparse[4]);
2460         Py_DECREF(restuple);
2461         return NULL;
2462     }
2463     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2464         &resunicode, newpos)) {
2465         Py_DECREF(restuple);
2466         return NULL;
2467     }
2468     if (*newpos<0)
2469         *newpos = size+*newpos;
2470     if (*newpos<0 || *newpos>size) {
2471         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2472         Py_DECREF(restuple);
2473         return NULL;
2474     }
2475     Py_INCREF(resunicode);
2476     Py_DECREF(restuple);
2477     return resunicode;
2478 }
2479
2480 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2481                                  int size,
2482                                  const char *errors,
2483                                  int limit)
2484 {
2485     /* output object */
2486     PyObject *res;
2487     /* pointers to the beginning and end+1 of input */
2488     const Py_UNICODE *startp = p;
2489     const Py_UNICODE *endp = p + size;
2490     /* pointer to the beginning of the unencodable characters */
2491     /* const Py_UNICODE *badp = NULL; */
2492     /* pointer into the output */
2493     char *str;
2494     /* current output position */
2495     int respos = 0;
2496     int ressize;
2497     char *encoding = (limit == 256) ? "latin-1" : "ascii";
2498     char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499     PyObject *errorHandler = NULL;
2500     PyObject *exc = NULL;
2501     /* the following variable is used for caching string comparisons
2502      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503     int known_errorHandler = -1;
2504
2505     /* allocate enough for a simple encoding without
2506        replacements, if we need more, we'll resize */
2507     res = PyString_FromStringAndSize(NULL, size);
2508     if (res == NULL)
2509         goto onError;
2510     if (size == 0)
2511         return res;
2512     str = PyString_AS_STRING(res);
2513     ressize = size;
2514
2515     while (p<endp) {
2516         Py_UNICODE c = *p;
2517
2518         /* can we encode this? */
2519         if (c<limit) {
2520             /* no overflow check, because we know that the space is enough */
2521             *str++ = (char)c;
2522             ++p;
2523         }
2524         else {
2525             int unicodepos = p-startp;
2526             int requiredsize;
2527             PyObject *repunicode;
2528             int repsize;
2529             int newpos;
2530             int respos;
2531             Py_UNICODE *uni2;
2532             /* startpos for collecting unencodable chars */
2533             const Py_UNICODE *collstart = p;
2534             const Py_UNICODE *collend = p;
2535             /* find all unecodable characters */
2536             while ((collend < endp) && ((*collend)>=limit))
2537                 ++collend;
2538             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539             if (known_errorHandler==-1) {
2540                 if ((errors==NULL) || (!strcmp(errors, "strict")))
2541                     known_errorHandler = 1;
2542                 else if (!strcmp(errors, "replace"))
2543                     known_errorHandler = 2;
2544                 else if (!strcmp(errors, "ignore"))
2545                     known_errorHandler = 3;
2546                 else if (!strcmp(errors, "xmlcharrefreplace"))
2547                     known_errorHandler = 4;
2548                 else
2549                     known_errorHandler = 0;
2550             }
2551             switch (known_errorHandler) {
2552                 case 1: /* strict */
2553                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2554                     goto onError;
2555                 case 2: /* replace */
2556                     while (collstart++<collend)
2557                         *str++ = '?'; /* fall through */
2558                 case 3: /* ignore */
2559                     p = collend;
2560                     break;
2561                 case 4: /* xmlcharrefreplace */
2562                     respos = str-PyString_AS_STRING(res);
2563                     /* determine replacement size (temporarily (mis)uses p) */
2564                     for (p = collstart, repsize = 0; p < collend; ++p) {
2565                         if (*p<10)
2566                             repsize += 2+1+1;
2567                         else if (*p<100)
2568                             repsize += 2+2+1;
2569                         else if (*p<1000)
2570                             repsize += 2+3+1;
2571                         else if (*p<10000)
2572                             repsize += 2+4+1;
2573 #ifndef Py_UNICODE_WIDE
2574                         else
2575                             repsize += 2+5+1;
2576 #else
2577                         else if (*p<100000)
2578                             repsize += 2+5+1;
2579                         else if (*p<1000000)
2580                             repsize += 2+6+1;
2581                         else
2582                             repsize += 2+7+1;
2583 #endif
2584                     }
2585                     requiredsize = respos+repsize+(endp-collend);
2586                     if (requiredsize > ressize) {
2587                         if (requiredsize<2*ressize)
2588                             requiredsize = 2*ressize;
2589                         if (_PyString_Resize(&res, requiredsize))
2590                             goto onError;
2591                         str = PyString_AS_STRING(res) + respos;
2592                         ressize = requiredsize;
2593                     }
2594                     /* generate replacement (temporarily (mis)uses p) */
2595                     for (p = collstart; p < collend; ++p) {
2596                         str += sprintf(str, "&#%d;", (int)*p);
2597                     }
2598                     p = collend;
2599                     break;
2600                 default:
2601                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2602                         encoding, reason, startp, size, &exc,
2603                         collstart-startp, collend-startp, &newpos);
2604                     if (repunicode == NULL)
2605                         goto onError;
2606                     /* need more space? (at least enough for what we
2607                        have+the replacement+the rest of the string, so
2608                        we won't have to check space for encodable characters) */
2609                     respos = str-PyString_AS_STRING(res);
2610                     repsize = PyUnicode_GET_SIZE(repunicode);
2611                     requiredsize = respos+repsize+(endp-collend);
2612                     if (requiredsize > ressize) {
2613                         if (requiredsize<2*ressize)
2614                             requiredsize = 2*ressize;
2615                         if (_PyString_Resize(&res, requiredsize)) {
2616                             Py_DECREF(repunicode);
2617                             goto onError;
2618                         }
2619                         str = PyString_AS_STRING(res) + respos;
2620                         ressize = requiredsize;
2621                     }
2622                     /* check if there is anything unencodable in the replacement
2623                        and copy it to the output */
2624                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2625                         c = *uni2;
2626                         if (c >= limit) {
2627                             raise_encode_exception(&exc, encoding, startp, size,
2628                                 unicodepos, unicodepos+1, reason);
2629                             Py_DECREF(repunicode);
2630                             goto onError;
2631                         }
2632                         *str = (char)c;
2633                     }
2634                     p = startp + newpos;
2635                     Py_DECREF(repunicode);
2636             }
2637         }
2638     }
2639     /* Resize if we allocated to much */
2640     respos = str-PyString_AS_STRING(res);
2641     if (respos<ressize)
2642        /* If this falls res will be NULL */
2643         _PyString_Resize(&res, respos);
2644     Py_XDECREF(errorHandler);
2645     Py_XDECREF(exc);
2646     return res;
2647
2648     onError:
2649     Py_XDECREF(res);
2650     Py_XDECREF(errorHandler);
2651     Py_XDECREF(exc);
2652     return NULL;
2653 }
2654
2655 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2656                                  int size,
2657                                  const char *errors)
2658 {
2659     return unicode_encode_ucs1(p, size, errors, 256);
2660 }
2661
2662 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2663 {
2664     if (!PyUnicode_Check(unicode)) {
2665         PyErr_BadArgument();
2666         return NULL;
2667     }
2668     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2669                                   PyUnicode_GET_SIZE(unicode),
2670                                   NULL);
2671 }
2672
2673 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2674
2675 PyObject *PyUnicode_DecodeASCII(const char *s,
2676                                 int size,
2677                                 const char *errors)
2678 {
2679     const char *starts = s;
2680     PyUnicodeObject *v;
2681     Py_UNICODE *p;
2682     int startinpos;
2683     int endinpos;
2684     int outpos;
2685     const char *e;
2686     PyObject *errorHandler = NULL;
2687     PyObject *exc = NULL;
2688
2689     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2690     if (size == 1 && *(unsigned char*)s < 128) {
2691         Py_UNICODE r = *(unsigned char*)s;
2692         return PyUnicode_FromUnicode(&r, 1);
2693     }
2694
2695     v = _PyUnicode_New(size);
2696     if (v == NULL)
2697         goto onError;
2698     if (size == 0)
2699         return (PyObject *)v;
2700     p = PyUnicode_AS_UNICODE(v);
2701     e = s + size;
2702     while (s < e) {
2703         register unsigned char c = (unsigned char)*s;
2704         if (c < 128) {
2705             *p++ = c;
2706             ++s;
2707         }
2708         else {
2709             startinpos = s-starts;
2710             endinpos = startinpos + 1;
2711             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2712             if (unicode_decode_call_errorhandler(
2713                  errors, &errorHandler,
2714                  "ascii", "ordinal not in range(128)",
2715                  starts, size, &startinpos, &endinpos, &exc, &s,
2716                  (PyObject **)&v, &outpos, &p))
2717                 goto onError;
2718         }
2719     }
2720     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2721         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2722             goto onError;
2723     Py_XDECREF(errorHandler);
2724     Py_XDECREF(exc);
2725     return (PyObject *)v;
2726
2727  onError:
2728     Py_XDECREF(v);
2729     Py_XDECREF(errorHandler);
2730     Py_XDECREF(exc);
2731     return NULL;
2732 }
2733
2734 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2735                                 int size,
2736                                 const char *errors)
2737 {
2738     return unicode_encode_ucs1(p, size, errors, 128);
2739 }
2740
2741 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2742 {
2743     if (!PyUnicode_Check(unicode)) {
2744         PyErr_BadArgument();
2745         return NULL;
2746     }
2747     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2748                                  PyUnicode_GET_SIZE(unicode),
2749                                  NULL);
2750 }
2751
2752 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2753
2754 /* --- MBCS codecs for Windows -------------------------------------------- */
2755
2756 PyObject *PyUnicode_DecodeMBCS(const char *s,
2757                                 int size,
2758                                 const char *errors)
2759 {
2760     PyUnicodeObject *v;
2761     Py_UNICODE *p;
2762
2763     /* First get the size of the result */
2764     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2765     if (size > 0 && usize==0)
2766         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2767
2768     v = _PyUnicode_New(usize);
2769     if (v == NULL)
2770         return NULL;
2771     if (usize == 0)
2772         return (PyObject *)v;
2773     p = PyUnicode_AS_UNICODE(v);
2774     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2775         Py_DECREF(v);
2776         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2777     }
2778
2779     return (PyObject *)v;
2780 }
2781
2782 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2783                                 int size,
2784                                 const char *errors)
2785 {
2786     PyObject *repr;
2787     char *s;
2788     DWORD mbcssize;
2789
2790     /* If there are no characters, bail now! */
2791     if (size==0)
2792             return PyString_FromString("");
2793
2794     /* First get the size of the result */
2795     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2796     if (mbcssize==0)
2797         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2798
2799     repr = PyString_FromStringAndSize(NULL, mbcssize);
2800     if (repr == NULL)
2801         return NULL;
2802     if (mbcssize == 0)
2803         return repr;
2804
2805     /* Do the conversion */
2806     s = PyString_AS_STRING(repr);
2807     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2808         Py_DECREF(repr);
2809         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2810     }
2811     return repr;
2812 }
2813
2814 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2815 {
2816     if (!PyUnicode_Check(unicode)) {
2817         PyErr_BadArgument();
2818         return NULL;
2819     }
2820     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821                                 PyUnicode_GET_SIZE(unicode),
2822                                 NULL);
2823 }
2824
2825 #endif /* MS_WINDOWS */
2826
2827 /* --- Character Mapping Codec -------------------------------------------- */
2828
2829 PyObject *PyUnicode_DecodeCharmap(const char *s,
2830                                   int size,
2831                                   PyObject *mapping,
2832                                   const char *errors)
2833 {
2834     const char *starts = s;
2835     int startinpos;
2836     int endinpos;
2837     int outpos;
2838     const char *e;
2839     PyUnicodeObject *v;
2840     Py_UNICODE *p;
2841     int extrachars = 0;
2842     PyObject *errorHandler = NULL;
2843     PyObject *exc = NULL;
2844     Py_UNICODE *mapstring = NULL;
2845     int maplen = 0;
2846
2847     /* Default to Latin-1 */
2848     if (mapping == NULL)
2849         return PyUnicode_DecodeLatin1(s, size, errors);
2850
2851     v = _PyUnicode_New(size);
2852     if (v == NULL)
2853         goto onError;
2854     if (size == 0)
2855         return (PyObject *)v;
2856     p = PyUnicode_AS_UNICODE(v);
2857     e = s + size;
2858     if (PyUnicode_CheckExact(mapping)) {
2859         mapstring = PyUnicode_AS_UNICODE(mapping);
2860         maplen = PyUnicode_GET_SIZE(mapping);
2861         while (s < e) {
2862             unsigned char ch = *s;
2863             Py_UNICODE x = 0xfffe; /* illegal value */
2864
2865             if (ch < maplen)
2866                 x = mapstring[ch];
2867
2868             if (x == 0xfffe) {
2869                 /* undefined mapping */
2870                 outpos = p-PyUnicode_AS_UNICODE(v);
2871                 startinpos = s-starts;
2872                 endinpos = startinpos+1;
2873                 if (unicode_decode_call_errorhandler(
2874                      errors, &errorHandler,
2875                      "charmap", "character maps to <undefined>",
2876                      starts, size, &startinpos, &endinpos, &exc, &s,
2877                      (PyObject **)&v, &outpos, &p)) {
2878                     goto onError;
2879                 }
2880                 continue;
2881             }
2882             *p++ = x;
2883             ++s;
2884         }
2885     }
2886     else {
2887         while (s < e) {
2888             unsigned char ch = *s;
2889             PyObject *w, *x;
2890
2891             /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892             w = PyInt_FromLong((long)ch);
2893             if (w == NULL)
2894                 goto onError;
2895             x = PyObject_GetItem(mapping, w);
2896             Py_DECREF(w);
2897             if (x == NULL) {
2898                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2899                     /* No mapping found means: mapping is undefined. */
2900                     PyErr_Clear();
2901                     x = Py_None;
2902                     Py_INCREF(x);
2903                 } else
2904                     goto onError;
2905             }
2906
2907             /* Apply mapping */
2908             if (PyInt_Check(x)) {
2909                 long value = PyInt_AS_LONG(x);
2910                 if (value < 0 || value > 65535) {
2911                     PyErr_SetString(PyExc_TypeError,
2912                                     "character mapping must be in range(65536)");
2913                     Py_DECREF(x);
2914                     goto onError;
2915                 }
2916                 *p++ = (Py_UNICODE)value;
2917             }
2918             else if (x == Py_None) {
2919                 /* undefined mapping */
2920                 outpos = p-PyUnicode_AS_UNICODE(v);
2921                 startinpos = s-starts;
2922                 endinpos = startinpos+1;
2923                 if (unicode_decode_call_errorhandler(
2924                      errors, &errorHandler,
2925                      "charmap", "character maps to <undefined>",
2926                      starts, size, &startinpos, &endinpos, &exc, &s,
2927                      (PyObject **)&v, &outpos, &p)) {
2928                     Py_DECREF(x);
2929                     goto onError;
2930                 }
2931                 continue;
2932             }
2933             else if (PyUnicode_Check(x)) {
2934                 int targetsize = PyUnicode_GET_SIZE(x);
2935
2936                 if (targetsize == 1)
2937                     /* 1-1 mapping */
2938                     *p++ = *PyUnicode_AS_UNICODE(x);
2939
2940                 else if (targetsize > 1) {
2941                     /* 1-n mapping */
2942                     if (targetsize > extrachars) {
2943                         /* resize first */
2944                         int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2945                         int needed = (targetsize - extrachars) + \
2946                                      (targetsize << 2);
2947                         extrachars += needed;
2948                         if (_PyUnicode_Resize(&v,
2949                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
2950                             Py_DECREF(x);
2951                             goto onError;
2952                         }
2953                         p = PyUnicode_AS_UNICODE(v) + oldpos;
2954                     }
2955                     Py_UNICODE_COPY(p,
2956                                     PyUnicode_AS_UNICODE(x),
2957                                     targetsize);
2958                     p += targetsize;
2959                     extrachars -= targetsize;
2960                 }
2961                 /* 1-0 mapping: skip the character */
2962             }
2963             else {
2964                 /* wrong return value */
2965                 PyErr_SetString(PyExc_TypeError,
2966                       "character mapping must return integer, None or unicode");
2967                 Py_DECREF(x);
2968                 goto onError;
2969             }
2970             Py_DECREF(x);
2971             ++s;
2972         }
2973     }
2974     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2975         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2976             goto onError;
2977     Py_XDECREF(errorHandler);
2978     Py_XDECREF(exc);
2979     return (PyObject *)v;
2980
2981  onError:
2982     Py_XDECREF(errorHandler);
2983     Py_XDECREF(exc);
2984     Py_XDECREF(v);
2985     return NULL;
2986 }
2987
2988 /* Lookup the character ch in the mapping. If the character
2989    can't be found, Py_None is returned (or NULL, if another
2990    error occurred). */
2991 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2992 {
2993     PyObject *w = PyInt_FromLong((long)c);
2994     PyObject *x;
2995
2996     if (w == NULL)
2997          return NULL;
2998     x = PyObject_GetItem(mapping, w);
2999     Py_DECREF(w);
3000     if (x == NULL) {
3001         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3002             /* No mapping found means: mapping is undefined. */
3003             PyErr_Clear();
3004             x = Py_None;
3005             Py_INCREF(x);
3006             return x;
3007         } else
3008             return NULL;
3009     }
3010     else if (x == Py_None)
3011         return x;
3012     else if (PyInt_Check(x)) {
3013         long value = PyInt_AS_LONG(x);
3014         if (value < 0 || value > 255) {
3015             PyErr_SetString(PyExc_TypeError,
3016                              "character mapping must be in range(256)");
3017             Py_DECREF(x);
3018             return NULL;
3019         }
3020         return x;
3021     }
3022     else if (PyString_Check(x))
3023         return x;
3024     else {
3025         /* wrong return value */
3026         PyErr_SetString(PyExc_TypeError,
3027               "character mapping must return integer, None or str");
3028         Py_DECREF(x);
3029         return NULL;
3030     }
3031 }
3032
3033 /* lookup the character, put the result in the output string and adjust
3034    various state variables. Reallocate the output string if not enough
3035    space is available. Return a new reference to the object that
3036    was put in the output buffer, or Py_None, if the mapping was undefined
3037    (in which case no character was written) or NULL, if a
3038    reallocation error ocurred. The called must decref the result */
3039 static
3040 PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3041     PyObject **outobj, int *outpos)
3042 {
3043     PyObject *rep = charmapencode_lookup(c, mapping);
3044
3045     if (rep==NULL)
3046         return NULL;
3047     else if (rep==Py_None)
3048         return rep;
3049     else {
3050         char *outstart = PyString_AS_STRING(*outobj);
3051         int outsize = PyString_GET_SIZE(*outobj);
3052         if (PyInt_Check(rep)) {
3053             int requiredsize = *outpos+1;
3054             if (outsize<requiredsize) {
3055                 /* exponentially overallocate to minimize reallocations */
3056                 if (requiredsize < 2*outsize)
3057                     requiredsize = 2*outsize;
3058                 if (_PyString_Resize(outobj, requiredsize)) {
3059                     Py_DECREF(rep);
3060                     return NULL;
3061                 }
3062                 outstart = PyString_AS_STRING(*outobj);
3063             }
3064             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3065         }
3066         else {
3067             const char *repchars = PyString_AS_STRING(rep);
3068             int repsize = PyString_GET_SIZE(rep);
3069             int requiredsize = *outpos+repsize;
3070             if (outsize<requiredsize) {
3071                 /* exponentially overallocate to minimize reallocations */
3072                 if (requiredsize < 2*outsize)
3073                     requiredsize = 2*outsize;
3074                 if (_PyString_Resize(outobj, requiredsize)) {
3075                     Py_DECREF(rep);
3076                     return NULL;
3077                 }
3078                 outstart = PyString_AS_STRING(*outobj);
3079             }
3080             memcpy(outstart + *outpos, repchars, repsize);
3081             *outpos += repsize;
3082         }
3083     }
3084     return rep;
3085 }
3086
3087 /* handle an error in PyUnicode_EncodeCharmap
3088    Return 0 on success, -1 on error */
3089 static
3090 int charmap_encoding_error(
3091     const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3092     PyObject **exceptionObject,
3093     int *known_errorHandler, PyObject **errorHandler, const char *errors,
3094     PyObject **res, int *respos)
3095 {
3096     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3097     int repsize;
3098     int newpos;
3099     Py_UNICODE *uni2;
3100     /* startpos for collecting unencodable chars */
3101     int collstartpos = *inpos;
3102     int collendpos = *inpos+1;
3103     int collpos;
3104     char *encoding = "charmap";
3105     char *reason = "character maps to <undefined>";
3106
3107     PyObject *x;
3108     /* find all unencodable characters */
3109     while (collendpos < size) {
3110         x = charmapencode_lookup(p[collendpos], mapping);
3111         if (x==NULL)
3112             return -1;
3113         else if (x!=Py_None) {
3114             Py_DECREF(x);
3115             break;
3116         }
3117         Py_DECREF(x);
3118         ++collendpos;
3119     }
3120     /* cache callback name lookup
3121      * (if not done yet, i.e. it's the first error) */
3122     if (*known_errorHandler==-1) {
3123         if ((errors==NULL) || (!strcmp(errors, "strict")))
3124             *known_errorHandler = 1;
3125         else if (!strcmp(errors, "replace"))
3126             *known_errorHandler = 2;
3127         else if (!strcmp(errors, "ignore"))
3128             *known_errorHandler = 3;
3129         else if (!strcmp(errors, "xmlcharrefreplace"))
3130             *known_errorHandler = 4;
3131         else
3132             *known_errorHandler = 0;
3133     }
3134     switch (*known_errorHandler) {
3135         case 1: /* strict */
3136             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3137             return -1;
3138         case 2: /* replace */
3139             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3140                 x = charmapencode_output('?', mapping, res, respos);
3141                 if (x==NULL) {
3142                     return -1;
3143                 }
3144                 else if (x==Py_None) {
3145                     Py_DECREF(x);
3146                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3147                     return -1;
3148                 }
3149                 Py_DECREF(x);
3150             }
3151             /* fall through */
3152         case 3: /* ignore */
3153             *inpos = collendpos;
3154             break;
3155         case 4: /* xmlcharrefreplace */
3156             /* generate replacement (temporarily (mis)uses p) */
3157             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3158                 char buffer[2+29+1+1];
3159                 char *cp;
3160                 sprintf(buffer, "&#%d;", (int)p[collpos]);
3161                 for (cp = buffer; *cp; ++cp) {
3162                     x = charmapencode_output(*cp, mapping, res, respos);
3163                     if (x==NULL)
3164                         return -1;
3165                     else if (x==Py_None) {
3166                         Py_DECREF(x);
3167                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3168                         return -1;
3169                     }
3170                     Py_DECREF(x);
3171                 }
3172             }
3173             *inpos = collendpos;
3174             break;
3175         default:
3176             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3177                 encoding, reason, p, size, exceptionObject,
3178                 collstartpos, collendpos, &newpos);
3179             if (repunicode == NULL)
3180                 return -1;
3181             /* generate replacement  */
3182             repsize = PyUnicode_GET_SIZE(repunicode);
3183             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3184                 x = charmapencode_output(*uni2, mapping, res, respos);
3185                 if (x==NULL) {
3186                     Py_DECREF(repunicode);
3187                     return -1;
3188                 }
3189                 else if (x==Py_None) {
3190                     Py_DECREF(repunicode);
3191                     Py_DECREF(x);
3192                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3193                     return -1;
3194                 }
3195                 Py_DECREF(x);
3196             }
3197             *inpos = newpos;
3198             Py_DECREF(repunicode);
3199     }
3200     return 0;
3201 }
3202
3203 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3204                                   int size,
3205                                   PyObject *mapping,
3206                                   const char *errors)
3207 {
3208     /* output object */
3209     PyObject *res = NULL;
3210     /* current input position */
3211     int inpos = 0;
3212     /* current output position */
3213     int respos = 0;
3214     PyObject *errorHandler = NULL;
3215     PyObject *exc = NULL;
3216     /* the following variable is used for caching string comparisons
3217      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3218      * 3=ignore, 4=xmlcharrefreplace */
3219     int known_errorHandler = -1;
3220
3221     /* Default to Latin-1 */
3222     if (mapping == NULL)
3223         return PyUnicode_EncodeLatin1(p, size, errors);
3224
3225     /* allocate enough for a simple encoding without
3226        replacements, if we need more, we'll resize */
3227     res = PyString_FromStringAndSize(NULL, size);
3228     if (res == NULL)
3229         goto onError;
3230     if (size == 0)
3231         return res;
3232
3233     while (inpos<size) {
3234         /* try to encode it */
3235         PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3236         if (x==NULL) /* error */
3237             goto onError;
3238         if (x==Py_None) { /* unencodable character */
3239             if (charmap_encoding_error(p, size, &inpos, mapping,
3240                 &exc,
3241                 &known_errorHandler, &errorHandler, errors,
3242                 &res, &respos)) {
3243                 Py_DECREF(x);
3244                 goto onError;
3245             }
3246         }
3247         else
3248             /* done with this character => adjust input position */
3249             ++inpos;
3250         Py_DECREF(x);
3251     }
3252
3253     /* Resize if we allocated to much */
3254     if (respos<PyString_GET_SIZE(res)) {
3255         if (_PyString_Resize(&res, respos))
3256             goto onError;
3257     }
3258     Py_XDECREF(exc);
3259     Py_XDECREF(errorHandler);
3260     return res;
3261
3262     onError:
3263     Py_XDECREF(res);
3264     Py_XDECREF(exc);
3265     Py_XDECREF(errorHandler);
3266     return NULL;
3267 }
3268
3269 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3270                                     PyObject *mapping)
3271 {
3272     if (!PyUnicode_Check(unicode) || mapping == NULL) {
3273         PyErr_BadArgument();
3274         return NULL;
3275     }
3276     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3277                                    PyUnicode_GET_SIZE(unicode),
3278                                    mapping,
3279                                    NULL);
3280 }
3281
3282 /* create or adjust a UnicodeTranslateError */
3283 static void make_translate_exception(PyObject **exceptionObject,
3284     const Py_UNICODE *unicode, int size,
3285     int startpos, int endpos,
3286     const char *reason)
3287 {
3288     if (*exceptionObject == NULL) {
3289         *exceptionObject = PyUnicodeTranslateError_Create(
3290             unicode, size, startpos, endpos, reason);
3291     }
3292     else {
3293         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3294             goto onError;
3295         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3296             goto onError;
3297         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3298             goto onError;
3299         return;
3300         onError:
3301         Py_DECREF(*exceptionObject);
3302         *exceptionObject = NULL;
3303     }
3304 }
3305
3306 /* raises a UnicodeTranslateError */
3307 static void raise_translate_exception(PyObject **exceptionObject,
3308     const Py_UNICODE *unicode, int size,
3309     int startpos, int endpos,
3310     const char *reason)
3311 {
3312     make_translate_exception(exceptionObject,
3313         unicode, size, startpos, endpos, reason);
3314     if (*exceptionObject != NULL)
3315         PyCodec_StrictErrors(*exceptionObject);
3316 }
3317
3318 /* error handling callback helper:
3319    build arguments, call the callback and check the arguments,
3320    put the result into newpos and return the replacement string, which
3321    has to be freed by the caller */
3322 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3323     PyObject **errorHandler,
3324     const char *reason,
3325     const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3326     int startpos, int endpos,
3327     int *newpos)
3328 {
3329     static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3330
3331     PyObject *restuple;
3332     PyObject *resunicode;
3333
3334     if (*errorHandler == NULL) {
3335         *errorHandler = PyCodec_LookupError(errors);
3336         if (*errorHandler == NULL)
3337             return NULL;
3338     }
3339
3340     make_translate_exception(exceptionObject,
3341         unicode, size, startpos, endpos, reason);
3342     if (*exceptionObject == NULL)
3343         return NULL;
3344
3345     restuple = PyObject_CallFunctionObjArgs(
3346         *errorHandler, *exceptionObject, NULL);
3347     if (restuple == NULL)
3348         return NULL;
3349     if (!PyTuple_Check(restuple)) {
3350         PyErr_Format(PyExc_TypeError, &argparse[4]);
3351         Py_DECREF(restuple);
3352         return NULL;
3353     }
3354     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3355         &resunicode, newpos)) {
3356         Py_DECREF(restuple);
3357         return NULL;
3358     }
3359     if (*newpos<0)
3360         *newpos = size+*newpos;
3361     if (*newpos<0 || *newpos>size) {
3362         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3363         Py_DECREF(restuple);
3364         return NULL;
3365     }
3366     Py_INCREF(resunicode);
3367     Py_DECREF(restuple);
3368     return resunicode;
3369 }
3370
3371 /* Lookup the character ch in the mapping and put the result in result,
3372    which must be decrefed by the caller.
3373    Return 0 on success, -1 on error */
3374 static
3375 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3376 {
3377     PyObject *w = PyInt_FromLong((long)c);
3378     PyObject *x;
3379
3380     if (w == NULL)
3381          return -1;
3382     x = PyObject_GetItem(mapping, w);
3383     Py_DECREF(w);
3384     if (x == NULL) {
3385         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3386             /* No mapping found means: use 1:1 mapping. */
3387             PyErr_Clear();
3388             *result = NULL;
3389             return 0;
3390         } else
3391             return -1;
3392     }
3393     else if (x == Py_None) {
3394         *result = x;
3395         return 0;
3396     }
3397     else if (PyInt_Check(x)) {
3398         long value = PyInt_AS_LONG(x);
3399         long max = PyUnicode_GetMax();
3400         if (value < 0 || value > max) {
3401             PyErr_Format(PyExc_TypeError,
3402                              "character mapping must be in range(0x%lx)", max+1);
3403             Py_DECREF(x);
3404             return -1;
3405         }
3406         *result = x;
3407         return 0;
3408     }
3409     else if (PyUnicode_Check(x)) {
3410         *result = x;
3411         return 0;
3412     }
3413     else {
3414         /* wrong return value */
3415         PyErr_SetString(PyExc_TypeError,
3416               "character mapping must return integer, None or unicode");
3417         Py_DECREF(x);
3418         return -1;
3419     }
3420 }
3421 /* ensure that *outobj is at least requiredsize characters long,
3422 if not reallocate and adjust various state variables.
3423 Return 0 on success, -1 on error */
3424 static
3425 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3426     int requiredsize)
3427 {
3428     int oldsize = PyUnicode_GET_SIZE(*outobj);
3429     if (requiredsize > oldsize) {
3430         /* remember old output position */
3431         int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3432         /* exponentially overallocate to minimize reallocations */
3433         if (requiredsize < 2 * oldsize)
3434             requiredsize = 2 * oldsize;
3435         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3436             return -1;
3437         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3438     }
3439     return 0;
3440 }
3441 /* lookup the character, put the result in the output string and adjust
3442    various state variables. Return a new reference to the object that
3443    was put in the output buffer in *result, or Py_None, if the mapping was
3444    undefined (in which case no character was written).
3445    The called must decref result.
3446    Return 0 on success, -1 on error. */
3447 static
3448 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3449     int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3450     PyObject **res)
3451 {
3452     if (charmaptranslate_lookup(*curinp, mapping, res))
3453         return -1;
3454     if (*res==NULL) {
3455         /* not found => default to 1:1 mapping */
3456         *(*outp)++ = *curinp;
3457     }
3458     else if (*res==Py_None)
3459         ;
3460     else if (PyInt_Check(*res)) {
3461         /* no overflow check, because we know that the space is enough */
3462         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3463     }
3464     else if (PyUnicode_Check(*res)) {
3465         int repsize = PyUnicode_GET_SIZE(*res);
3466         if (repsize==1) {
3467             /* no overflow check, because we know that the space is enough */
3468             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3469         }
3470         else if (repsize!=0) {
3471             /* more than one character */
3472             int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3473                 (insize - (curinp-startinp)) +
3474                 repsize - 1;
3475             if (charmaptranslate_makespace(outobj, outp, requiredsize))
3476                 return -1;
3477             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3478             *outp += repsize;
3479         }
3480     }
3481     else
3482         return -1;
3483     return 0;
3484 }
3485
3486 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3487                                      int size,
3488                                      PyObject *mapping,
3489                                      const char *errors)
3490 {
3491     /* output object */
3492     PyObject *res = NULL;
3493     /* pointers to the beginning and end+1 of input */
3494     const Py_UNICODE *startp = p;
3495     const Py_UNICODE *endp = p + size;
3496     /* pointer into the output */
3497     Py_UNICODE *str;
3498     /* current output position */
3499     int respos = 0;
3500     char *reason = "character maps to <undefined>";
3501     PyObject *errorHandler = NULL;
3502     PyObject *exc = NULL;
3503     /* the following variable is used for caching string comparisons
3504      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3505      * 3=ignore, 4=xmlcharrefreplace */
3506     int known_errorHandler = -1;
3507
3508     if (mapping == NULL) {
3509         PyErr_BadArgument();
3510         return NULL;
3511     }
3512
3513     /* allocate enough for a simple 1:1 translation without
3514        replacements, if we need more, we'll resize */
3515     res = PyUnicode_FromUnicode(NULL, size);
3516     if (res == NULL)
3517         goto onError;
3518     if (size == 0)
3519         return res;
3520     str = PyUnicode_AS_UNICODE(res);
3521
3522     while (p<endp) {
3523         /* try to encode it */
3524         PyObject *x = NULL;
3525         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3526             Py_XDECREF(x);
3527             goto onError;
3528         }
3529         Py_XDECREF(x);
3530         if (x!=Py_None) /* it worked => adjust input pointer */
3531             ++p;
3532         else { /* untranslatable character */
3533             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3534             int repsize;
3535             int newpos;
3536             Py_UNICODE *uni2;
3537             /* startpos for collecting untranslatable chars */
3538             const Py_UNICODE *collstart = p;
3539             const Py_UNICODE *collend = p+1;
3540             const Py_UNICODE *coll;
3541
3542             /* find all untranslatable characters */
3543             while (collend < endp) {
3544                 if (charmaptranslate_lookup(*collend, mapping, &x))
3545                     goto onError;
3546                 Py_XDECREF(x);
3547                 if (x!=Py_None)
3548                     break;
3549                 ++collend;
3550             }
3551             /* cache callback name lookup
3552              * (if not done yet, i.e. it's the first error) */
3553             if (known_errorHandler==-1) {
3554                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3555                     known_errorHandler = 1;
3556                 else if (!strcmp(errors, "replace"))
3557                     known_errorHandler = 2;
3558                 else if (!strcmp(errors, "ignore"))
3559                     known_errorHandler = 3;
3560                 else if (!strcmp(errors, "xmlcharrefreplace"))
3561                     known_errorHandler = 4;
3562                 else
3563                     known_errorHandler = 0;
3564             }
3565             switch (known_errorHandler) {
3566                 case 1: /* strict */
3567                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3568                     goto onError;
3569                 case 2: /* replace */
3570                     /* No need to check for space, this is a 1:1 replacement */
3571                     for (coll = collstart; coll<collend; ++coll)
3572                         *str++ = '?';
3573                     /* fall through */
3574                 case 3: /* ignore */
3575                     p = collend;
3576                     break;
3577                 case 4: /* xmlcharrefreplace */
3578                     /* generate replacement (temporarily (mis)uses p) */
3579                     for (p = collstart; p < collend; ++p) {
3580                         char buffer[2+29+1+1];
3581                         char *cp;
3582                         sprintf(buffer, "&#%d;", (int)*p);
3583                         if (charmaptranslate_makespace(&res, &str,
3584                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3585                             goto onError;
3586                         for (cp = buffer; *cp; ++cp)
3587                             *str++ = *cp;
3588                     }
3589                     p = collend;
3590                     break;
3591                 default:
3592                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3593                         reason, startp, size, &exc,
3594                         collstart-startp, collend-startp, &newpos);
3595                     if (repunicode == NULL)
3596                         goto onError;
3597                     /* generate replacement  */
3598                     repsize = PyUnicode_GET_SIZE(repunicode);
3599                     if (charmaptranslate_makespace(&res, &str,
3600                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3601                         Py_DECREF(repunicode);
3602                         goto onError;
3603                     }
3604                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3605                         *str++ = *uni2;
3606                     p = startp + newpos;
3607                     Py_DECREF(repunicode);
3608             }
3609         }
3610     }
3611     /* Resize if we allocated to much */
3612     respos = str-PyUnicode_AS_UNICODE(res);
3613     if (respos<PyUnicode_GET_SIZE(res)) {
3614         if (_PyUnicode_Resize(&res, respos) < 0)
3615             goto onError;
3616     }
3617     Py_XDECREF(exc);
3618     Py_XDECREF(errorHandler);
3619     return res;
3620
3621     onError:
3622     Py_XDECREF(res);
3623     Py_XDECREF(exc);
3624     Py_XDECREF(errorHandler);
3625     return NULL;
3626 }
3627
3628 PyObject *PyUnicode_Translate(PyObject *str,
3629                               PyObject *mapping,
3630                               const char *errors)
3631 {
3632     PyObject *result;
3633
3634     str = PyUnicode_FromObject(str);
3635     if (str == NULL)
3636         goto onError;
3637     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3638                                         PyUnicode_GET_SIZE(str),
3639                                         mapping,
3640                                         errors);
3641     Py_DECREF(str);
3642     return result;
3643
3644  onError:
3645     Py_XDECREF(str);
3646     return NULL;
3647 }
3648
3649 /* --- Decimal Encoder ---------------------------------------------------- */
3650
3651 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3652                             int length,
3653                             char *output,
3654                             const char *errors)
3655 {
3656     Py_UNICODE *p, *end;
3657     PyObject *errorHandler = NULL;
3658     PyObject *exc = NULL;
3659     const char *encoding = "decimal";
3660     const char *reason = "invalid decimal Unicode string";
3661     /* the following variable is used for caching string comparisons
3662      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3663     int known_errorHandler = -1;
3664
3665     if (output == NULL) {
3666         PyErr_BadArgument();
3667         return -1;
3668     }
3669
3670     p = s;
3671     end = s + length;
3672     while (p < end) {
3673         register Py_UNICODE ch = *p;
3674         int decimal;
3675         PyObject *repunicode;
3676         int repsize;
3677         int newpos;
3678         Py_UNICODE *uni2;
3679         Py_UNICODE *collstart;
3680         Py_UNICODE *collend;
3681
3682         if (Py_UNICODE_ISSPACE(ch)) {
3683             *output++ = ' ';
3684             ++p;
3685             continue;
3686         }
3687         decimal = Py_UNICODE_TODECIMAL(ch);
3688         if (decimal >= 0) {
3689             *output++ = '0' + decimal;
3690             ++p;
3691             continue;
3692         }
3693         if (0 < ch && ch < 256) {
3694             *output++ = (char)ch;
3695             ++p;
3696             continue;
3697         }
3698         /* All other characters are considered unencodable */
3699         collstart = p;
3700         collend = p+1;
3701         while (collend < end) {
3702             if ((0 < *collend && *collend < 256) ||
3703                 !Py_UNICODE_ISSPACE(*collend) ||
3704                 Py_UNICODE_TODECIMAL(*collend))
3705                 break;
3706         }
3707         /* cache callback name lookup
3708          * (if not done yet, i.e. it's the first error) */
3709         if (known_errorHandler==-1) {
3710             if ((errors==NULL) || (!strcmp(errors, "strict")))
3711                 known_errorHandler = 1;
3712             else if (!strcmp(errors, "replace"))
3713                 known_errorHandler = 2;
3714             else if (!strcmp(errors, "ignore"))
3715                 known_errorHandler = 3;
3716             else if (!strcmp(errors, "xmlcharrefreplace"))
3717                 known_errorHandler = 4;
3718             else
3719                 known_errorHandler = 0;
3720         }
3721         switch (known_errorHandler) {
3722             case 1: /* strict */
3723                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3724                 goto onError;
3725             case 2: /* replace */
3726                 for (p = collstart; p < collend; ++p)
3727                     *output++ = '?';
3728                 /* fall through */
3729             case 3: /* ignore */
3730                 p = collend;
3731                 break;
3732             case 4: /* xmlcharrefreplace */
3733                 /* generate replacement (temporarily (mis)uses p) */
3734                 for (p = collstart; p < collend; ++p)
3735                     output += sprintf(output, "&#%d;", (int)*p);
3736                 p = collend;
3737                 break;
3738             default:
3739                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3740                     encoding, reason, s, length, &exc,
3741                     collstart-s, collend-s, &newpos);
3742                 if (repunicode == NULL)
3743                     goto onError;
3744                 /* generate replacement  */
3745                 repsize = PyUnicode_GET_SIZE(repunicode);
3746                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3747                     Py_UNICODE ch = *uni2;
3748                     if (Py_UNICODE_ISSPACE(ch))
3749                         *output++ = ' ';
3750                     else {
3751                         decimal = Py_UNICODE_TODECIMAL(ch);
3752                         if (decimal >= 0)
3753                             *output++ = '0' + decimal;
3754                         else if (0 < ch && ch < 256)
3755                             *output++ = (char)ch;
3756                         else {
3757                             Py_DECREF(repunicode);
3758                             raise_encode_exception(&exc, encoding,
3759                                 s, length, collstart-s, collend-s, reason);
3760                             goto onError;
3761                         }
3762                     }
3763                 }
3764                 p = s + newpos;
3765                 Py_DECREF(repunicode);
3766         }
3767     }
3768     /* 0-terminate the output string */
3769     *output++ = '\0';
3770     Py_XDECREF(exc);
3771     Py_XDECREF(errorHandler);
3772     return 0;
3773
3774  onError:
3775     Py_XDECREF(exc);
3776     Py_XDECREF(errorHandler);
3777     return -1;
3778 }
3779
3780 /* --- Helpers ------------------------------------------------------------ */
3781
3782 static
3783 int count(PyUnicodeObject *self,
3784           int start,
3785           int end,
3786           PyUnicodeObject *substring)
3787 {
3788     int count = 0;
3789
3790     if (start < 0)
3791         start += self->length;
3792     if (start < 0)
3793         start = 0;
3794     if (end > self->length)
3795         end = self->length;
3796     if (end < 0)
3797         end += self->length;
3798     if (end < 0)
3799         end = 0;
3800
3801     if (substring->length == 0)
3802         return (end - start + 1);
3803
3804     end -= substring->length;
3805
3806     while (start <= end)
3807         if (Py_UNICODE_MATCH(self, start, substring)) {
3808             count++;
3809             start += substring->length;
3810         } else
3811             start++;
3812
3813     return count;
3814 }
3815
3816 int PyUnicode_Count(PyObject *str,
3817                     PyObject *substr,
3818                     int start,
3819                     int end)
3820 {
3821     int result;
3822
3823     str = PyUnicode_FromObject(str);
3824     if (str == NULL)
3825         return -1;
3826     substr = PyUnicode_FromObject(substr);
3827     if (substr == NULL) {
3828         Py_DECREF(str);
3829         return -1;
3830     }
3831
3832     result = count((PyUnicodeObject *)str,
3833                    start, end,
3834                    (PyUnicodeObject *)substr);
3835
3836     Py_DECREF(str);
3837     Py_DECREF(substr);
3838     return result;
3839 }
3840
3841 static
3842 int findstring(PyUnicodeObject *self,
3843                PyUnicodeObject *substring,
3844                int start,
3845                int end,
3846                int direction)
3847 {
3848     if (start < 0)
3849         start += self->length;
3850     if (start < 0)
3851         start = 0;
3852
3853     if (end > self->length)
3854         end = self->length;
3855     if (end < 0)
3856         end += self->length;
3857     if (end < 0)
3858         end = 0;
3859
3860     if (substring->length == 0)
3861         return (direction > 0) ? start : end;
3862
3863     end -= substring->length;
3864
3865     if (direction < 0) {
3866         for (; end >= start; end--)
3867             if (Py_UNICODE_MATCH(self, end, substring))
3868                 return end;
3869     } else {
3870         for (; start <= end; start++)
3871             if (Py_UNICODE_MATCH(self, start, substring))
3872                 return start;
3873     }
3874
3875     return -1;
3876 }
3877
3878 int PyUnicode_Find(PyObject *str,
3879                    PyObject *substr,
3880                    int start,
3881                    int end,
3882                    int direction)
3883 {
3884     int result;
3885
3886     str = PyUnicode_FromObject(str);
3887     if (str == NULL)
3888         return -2;
3889     substr = PyUnicode_FromObject(substr);
3890     if (substr == NULL) {
3891         Py_DECREF(str);
3892         return -2;
3893     }
3894
3895     result = findstring((PyUnicodeObject *)str,
3896                         (PyUnicodeObject *)substr,
3897                         start, end, direction);
3898     Py_DECREF(str);
3899     Py_DECREF(substr);
3900     return result;
3901 }
3902
3903 static
3904 int tailmatch(PyUnicodeObject *self,
3905               PyUnicodeObject *substring,
3906               int start,
3907               int end,
3908               int direction)
3909 {
3910     if (start < 0)
3911         start += self->length;
3912     if (start < 0)
3913         start = 0;
3914
3915     if (substring->length == 0)
3916         return 1;
3917
3918     if (end > self->length)
3919         end = self->length;
3920     if (end < 0)
3921         end += self->length;
3922     if (end < 0)
3923         end = 0;
3924
3925     end -= substring->length;
3926     if (end < start)
3927         return 0;
3928
3929     if (direction > 0) {
3930         if (Py_UNICODE_MATCH(self, end, substring))
3931             return 1;
3932     } else {
3933         if (Py_UNICODE_MATCH(self, start, substring))
3934             return 1;
3935     }
3936
3937     return 0;
3938 }
3939
3940 int PyUnicode_Tailmatch(PyObject *str,
3941                         PyObject *substr,
3942                         int start,
3943                         int end,
3944                         int direction)
3945 {
3946     int result;
3947
3948     str = PyUnicode_FromObject(str);
3949     if (str == NULL)
3950         return -1;
3951     substr = PyUnicode_FromObject(substr);
3952     if (substr == NULL) {
3953         Py_DECREF(substr);
3954         return -1;
3955     }
3956
3957     result = tailmatch((PyUnicodeObject *)str,
3958                        (PyUnicodeObject *)substr,
3959                        start, end, direction);
3960     Py_DECREF(str);
3961     Py_DECREF(substr);
3962     return result;
3963 }
3964
3965 static
3966 const Py_UNICODE *findchar(const Py_UNICODE *s,
3967                      int size,
3968                      Py_UNICODE ch)
3969 {
3970     /* like wcschr, but doesn't stop at NULL characters */
3971
3972     while (size-- > 0) {
3973         if (*s == ch)
3974             return s;
3975         s++;
3976     }
3977
3978     return NULL;
3979 }
3980
3981 /* Apply fixfct filter to the Unicode object self and return a
3982    reference to the modified object */
3983
3984 static
3985 PyObject *fixup(PyUnicodeObject *self,
3986                 int (*fixfct)(PyUnicodeObject *s))
3987 {
3988
3989     PyUnicodeObject *u;
3990
3991     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3992     if (u == NULL)
3993         return NULL;
3994
3995     Py_UNICODE_COPY(u->str, self->str, self->length);
3996
3997     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3998         /* fixfct should return TRUE if it modified the buffer. If
3999            FALSE, return a reference to the original buffer instead
4000            (to save space, not time) */
4001         Py_INCREF(self);
4002         Py_DECREF(u);
4003         return (PyObject*) self;
4004     }
4005     return (PyObject*) u;
4006 }
4007
4008 static
4009 int fixupper(PyUnicodeObject *self)
4010 {
4011     int len = self->length;
4012     Py_UNICODE *s = self->str;
4013     int status = 0;
4014
4015     while (len-- > 0) {
4016         register Py_UNICODE ch;
4017
4018         ch = Py_UNICODE_TOUPPER(*s);
4019         if (ch != *s) {
4020             status = 1;
4021             *s = ch;
4022         }
4023         s++;
4024     }
4025
4026     return status;
4027 }
4028
4029 static
4030 int fixlower(PyUnicodeObject *self)
4031 {
4032     int len = self->length;
4033     Py_UNICODE *s = self->str;
4034     int status = 0;
4035
4036     while (len-- > 0) {
4037         register Py_UNICODE ch;
4038
4039         ch = Py_UNICODE_TOLOWER(*s);
4040         if (ch != *s) {
4041             status = 1;
4042             *s = ch;
4043         }
4044         s++;
4045     }
4046
4047     return status;
4048 }
4049
4050 static
4051 int fixswapcase(PyUnicodeObject *self)
4052 {
4053     int len = self->length;
4054     Py_UNICODE *s = self->str;
4055     int status = 0;
4056
4057     while (len-- > 0) {
4058         if (Py_UNICODE_ISUPPER(*s)) {
4059             *s = Py_UNICODE_TOLOWER(*s);
4060             status = 1;
4061         } else if (Py_UNICODE_ISLOWER(*s)) {
4062             *s = Py_UNICODE_TOUPPER(*s);
4063             status = 1;
4064         }
4065         s++;
4066     }
4067
4068     return status;
4069 }
4070
4071 static
4072 int fixcapitalize(PyUnicodeObject *self)
4073 {
4074     int len = self->length;
4075     Py_UNICODE *s = self->str;
4076     int status = 0;
4077
4078     if (len == 0)
4079         return 0;
4080     if (Py_UNICODE_ISLOWER(*s)) {
4081         *s = Py_UNICODE_TOUPPER(*s);
4082         status = 1;
4083     }
4084     s++;
4085     while (--len > 0) {
4086         if (Py_UNICODE_ISUPPER(*s)) {
4087             *s = Py_UNICODE_TOLOWER(*s);
4088             status = 1;
4089         }
4090         s++;
4091     }
4092     return status;
4093 }
4094
4095 static
4096 int fixtitle(PyUnicodeObject *self)
4097 {
4098     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099     register Py_UNICODE *e;
4100     int previous_is_cased;
4101
4102     /* Shortcut for single character strings */
4103     if (PyUnicode_GET_SIZE(self) == 1) {
4104         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4105         if (*p != ch) {
4106             *p = ch;
4107             return 1;
4108         }
4109         else
4110             return 0;
4111     }
4112
4113     e = p + PyUnicode_GET_SIZE(self);
4114     previous_is_cased = 0;
4115     for (; p < e; p++) {
4116         register const Py_UNICODE ch = *p;
4117
4118         if (previous_is_cased)
4119             *p = Py_UNICODE_TOLOWER(ch);
4120         else
4121             *p = Py_UNICODE_TOTITLE(ch);
4122
4123         if (Py_UNICODE_ISLOWER(ch) ||
4124             Py_UNICODE_ISUPPER(ch) ||
4125             Py_UNICODE_ISTITLE(ch))
4126             previous_is_cased = 1;
4127         else
4128             previous_is_cased = 0;
4129     }
4130     return 1;
4131 }
4132
4133 PyObject *
4134 PyUnicode_Join(PyObject *separator, PyObject *seq)
4135 {
4136     PyObject *internal_separator = NULL;
4137     const Py_UNICODE blank = ' ';
4138     const Py_UNICODE *sep = &blank;
4139     size_t seplen = 1;
4140     PyUnicodeObject *res = NULL; /* the result */
4141     size_t res_alloc = 100;  /* # allocated bytes for string in res */
4142     size_t res_used;         /* # used bytes */
4143     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4144     PyObject *fseq;          /* PySequence_Fast(seq) */
4145     int seqlen;              /* len(fseq) -- number of items in sequence */
4146     PyObject *item;
4147     int i;
4148
4149     fseq = PySequence_Fast(seq, "");
4150     if (fseq == NULL) {
4151         return NULL;
4152     }
4153
4154     /* Grrrr.  A codec may be invoked to convert str objects to
4155      * Unicode, and so it's possible to call back into Python code
4156      * during PyUnicode_FromObject(), and so it's possible for a sick
4157      * codec to change the size of fseq (if seq is a list).  Therefore
4158      * we have to keep refetching the size -- can't assume seqlen
4159      * is invariant.
4160      */
4161     seqlen = PySequence_Fast_GET_SIZE(fseq);
4162     /* If empty sequence, return u"". */
4163     if (seqlen == 0) {
4164         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4165         goto Done;
4166     }
4167     /* If singleton sequence with an exact Unicode, return that. */
4168     if (seqlen == 1) {
4169         item = PySequence_Fast_GET_ITEM(fseq, 0);
4170         if (PyUnicode_CheckExact(item)) {
4171             Py_INCREF(item);
4172             res = (PyUnicodeObject *)item;
4173             goto Done;
4174         }
4175     }
4176
4177     /* At least two items to join, or one that isn't exact Unicode. */
4178     if (seqlen > 1) {
4179         /* Set up sep and seplen -- they're needed. */
4180         if (separator == NULL) {
4181             sep = &blank;
4182             seplen = 1;
4183         }
4184         else {
4185             internal_separator = PyUnicode_FromObject(separator);
4186             if (internal_separator == NULL)
4187                 goto onError;
4188             sep = PyUnicode_AS_UNICODE(internal_separator);
4189             seplen = PyUnicode_GET_SIZE(internal_separator);
4190             /* In case PyUnicode_FromObject() mutated seq. */
4191             seqlen = PySequence_Fast_GET_SIZE(fseq);
4192         }
4193     }
4194
4195     /* Get space. */
4196     res = _PyUnicode_New((int)res_alloc);
4197     if (res == NULL)
4198         goto onError;
4199     res_p = PyUnicode_AS_UNICODE(res);
4200     res_used = 0;
4201
4202     for (i = 0; i < seqlen; ++i) {
4203         size_t itemlen;
4204         size_t new_res_used;
4205
4206         item = PySequence_Fast_GET_ITEM(fseq, i);
4207         /* Convert item to Unicode. */
4208         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4209             PyErr_Format(PyExc_TypeError,
4210                          "sequence item %i: expected string or Unicode,"
4211                          " %.80s found",
4212                          i, item->ob_type->tp_name);
4213             goto onError;
4214         }
4215         item = PyUnicode_FromObject(item);
4216         if (item == NULL)
4217             goto onError;
4218         /* We own a reference to item from here on. */
4219
4220         /* In case PyUnicode_FromObject() mutated seq. */
4221         seqlen = PySequence_Fast_GET_SIZE(fseq);
4222
4223         /* Make sure we have enough space for the separator and the item. */
4224         itemlen = PyUnicode_GET_SIZE(item);
4225         new_res_used = res_used + itemlen;
4226         if (new_res_used < res_used ||  new_res_used > INT_MAX)
4227             goto Overflow;
4228         if (i < seqlen - 1) {
4229             new_res_used += seplen;
4230             if (new_res_used < res_used ||  new_res_used > INT_MAX)
4231                 goto Overflow;
4232         }
4233         if (new_res_used > res_alloc) {
4234             /* double allocated size until it's big enough */
4235             do {
4236                 size_t oldsize = res_alloc;
4237                 res_alloc += res_alloc;
4238                 if (res_alloc < oldsize || res_alloc > INT_MAX)
4239                     goto Overflow;
4240             } while (new_res_used > res_alloc);
4241             if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
4242                 Py_DECREF(item);
4243                 goto onError;
4244             }
4245             res_p = PyUnicode_AS_UNICODE(res) + res_used;
4246         }
4247
4248         /* Copy item, and maybe the separator. */
4249         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4250         res_p += itemlen;
4251         if (i < seqlen - 1) {
4252             Py_UNICODE_COPY(res_p, sep, (int)seplen);
4253             res_p += seplen;
4254         }
4255         Py_DECREF(item);
4256         res_used = new_res_used;
4257     }
4258
4259     /* Shrink res to match the used area; this probably can't fail,
4260      * but it's cheap to check.
4261      */
4262     if (_PyUnicode_Resize(&res, (int)res_used) < 0)
4263         goto onError;
4264
4265  Done:
4266     Py_XDECREF(internal_separator);
4267     Py_DECREF(fseq);
4268     return (PyObject *)res;
4269
4270  Overflow:
4271     PyErr_SetString(PyExc_OverflowError,
4272                     "join() is too long for a Python string");
4273     Py_DECREF(item);
4274     /* fall through */
4275
4276  onError:
4277     Py_XDECREF(internal_separator);
4278     Py_DECREF(fseq);
4279     Py_XDECREF(res);
4280     return NULL;
4281 }
4282
4283 static
4284 PyUnicodeObject *pad(PyUnicodeObject *self,
4285                      int left,
4286                      int right,
4287                      Py_UNICODE fill)
4288 {
4289     PyUnicodeObject *u;
4290
4291     if (left < 0)
4292         left = 0;
4293     if (right < 0)
4294         right = 0;
4295
4296     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4297         Py_INCREF(self);
4298         return self;
4299     }
4300
4301     u = _PyUnicode_New(left + self->length + right);
4302     if (u) {
4303         if (left)
4304             Py_UNICODE_FILL(u->str, fill, left);
4305         Py_UNICODE_COPY(u->str + left, self->str, self->length);
4306         if (right)
4307             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4308     }
4309
4310     return u;
4311 }
4312
/* Build a Unicode object from data[left:right] and append it to the
   list held in the local variable `list`.  On failure control jumps to
   the `onError` label of the enclosing function.  The local variable
   `str` is used as scratch space.  The body is wrapped in do/while(0)
   so that an invocation followed by ';' is exactly one statement. */
#define SPLIT_APPEND(data, left, right)                                     \
    do {                                                                    \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
        if (!str)                                                           \
            goto onError;                                                   \
        if (PyList_Append(list, str)) {                                     \
            Py_DECREF(str);                                                 \
            goto onError;                                                   \
        }                                                                   \
        Py_DECREF(str);                                                     \
    } while (0)
4323
/* Build a Unicode object from data[left:right] and insert it at the
   front (index 0) of the list held in the local variable `list`.  On
   failure control jumps to the `onError` label of the enclosing
   function.  The local variable `str` is used as scratch space.  The
   body is wrapped in do/while(0) so that an invocation followed by ';'
   is exactly one statement. */
#define SPLIT_INSERT(data, left, right)                                     \
    do {                                                                    \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
        if (!str)                                                           \
            goto onError;                                                   \
        if (PyList_Insert(list, 0, str)) {                                  \
            Py_DECREF(str);                                                 \
            goto onError;                                                   \
        }                                                                   \
        Py_DECREF(str);                                                     \
    } while (0)
4334
4335 static
4336 PyObject *split_whitespace(PyUnicodeObject *self,
4337                            PyObject *list,
4338                            int maxcount)
4339 {
4340     register int i;
4341     register int j;
4342     int len = self->length;
4343     PyObject *str;
4344
4345     for (i = j = 0; i < len; ) {
4346         /* find a token */
4347         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4348             i++;
4349         j = i;
4350         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4351             i++;
4352         if (j < i) {
4353             if (maxcount-- <= 0)
4354                 break;
4355             SPLIT_APPEND(self->str, j, i);
4356             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4357                 i++;
4358             j = i;
4359         }
4360     }
4361     if (j < len) {
4362         SPLIT_APPEND(self->str, j, len);
4363     }
4364     return list;
4365
4366  onError:
4367     Py_DECREF(list);
4368     return NULL;
4369 }
4370
4371 PyObject *PyUnicode_Splitlines(PyObject *string,
4372                                int keepends)
4373 {
4374     register int i;
4375     register int j;
4376     int len;
4377     PyObject *list;
4378     PyObject *str;
4379     Py_UNICODE *data;
4380
4381     string = PyUnicode_FromObject(string);
4382     if (string == NULL)
4383         return NULL;
4384     data = PyUnicode_AS_UNICODE(string);
4385     len = PyUnicode_GET_SIZE(string);
4386
4387     list = PyList_New(0);
4388     if (!list)
4389         goto onError;
4390
4391     for (i = j = 0; i < len; ) {
4392         int eol;
4393
4394         /* Find a line and append it */
4395         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4396             i++;
4397
4398         /* Skip the line break reading CRLF as one line break */
4399         eol = i;
4400         if (i < len) {
4401             if (data[i] == '\r' && i + 1 < len &&
4402                 data[i+1] == '\n')
4403                 i += 2;
4404             else
4405                 i++;
4406             if (keepends)
4407                 eol = i;
4408         }
4409         SPLIT_APPEND(data, j, eol);
4410         j = i;
4411     }
4412     if (j < len) {
4413         SPLIT_APPEND(data, j, len);
4414     }
4415
4416     Py_DECREF(string);
4417     return list;
4418
4419  onError:
4420     Py_DECREF(list);
4421     Py_DECREF(string);
4422     return NULL;
4423 }
4424
4425 static
4426 PyObject *split_char(PyUnicodeObject *self,
4427                      PyObject *list,
4428                      Py_UNICODE ch,
4429                      int maxcount)
4430 {
4431     register int i;
4432     register int j;
4433     int len = self->length;
4434     PyObject *str;
4435
4436     for (i = j = 0; i < len; ) {
4437         if (self->str[i] == ch) {
4438             if (maxcount-- <= 0)
4439                 break;
4440             SPLIT_APPEND(self->str, j, i);
4441             i = j = i + 1;
4442         } else
4443             i++;
4444     }
4445     if (j <= len) {
4446         SPLIT_APPEND(self->str, j, len);
4447     }
4448     return list;
4449
4450  onError:
4451     Py_DECREF(list);
4452     return NULL;
4453 }
4454
4455 static
4456 PyObject *split_substring(PyUnicodeObject *self,
4457                           PyObject *list,
4458                           PyUnicodeObject *substring,
4459                           int maxcount)
4460 {
4461     register int i;
4462     register int j;
4463     int len = self->length;
4464     int sublen = substring->length;
4465     PyObject *str;
4466
4467     for (i = j = 0; i <= len - sublen; ) {
4468         if (Py_UNICODE_MATCH(self, i, substring)) {
4469             if (maxcount-- <= 0)
4470                 break;
4471             SPLIT_APPEND(self->str, j, i);
4472             i = j = i + sublen;
4473         } else
4474             i++;
4475     }
4476     if (j <= len) {
4477         SPLIT_APPEND(self->str, j, len);
4478     }
4479     return list;
4480
4481  onError:
4482     Py_DECREF(list);
4483     return NULL;
4484 }
4485
4486 static
4487 PyObject *rsplit_whitespace(PyUnicodeObject *self,
4488                             PyObject *list,
4489                             int maxcount)
4490 {
4491     register int i;
4492     register int j;
4493     int len = self->length;
4494     PyObject *str;
4495
4496     for (i = j = len - 1; i >= 0; ) {
4497         /* find a token */
4498         while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4499             i--;
4500         j = i;
4501         while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4502             i--;
4503         if (j > i) {
4504             if (maxcount-- <= 0)
4505                 break;
4506             SPLIT_INSERT(self->str, i + 1, j + 1);
4507             while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4508                 i--;
4509             j = i;
4510         }
4511     }
4512     if (j >= 0) {
4513         SPLIT_INSERT(self->str, 0, j + 1);
4514     }
4515     return list;
4516
4517  onError:
4518     Py_DECREF(list);
4519     return NULL;
4520 }
4521
4522 static
4523 PyObject *rsplit_char(PyUnicodeObject *self,
4524                       PyObject *list,
4525                       Py_UNICODE ch,
4526                       int maxcount)
4527 {
4528     register int i;
4529     register int j;
4530     int len = self->length;
4531     PyObject *str;
4532
4533     for (i = j = len - 1; i >= 0; ) {
4534         if (self->str[i] == ch) {
4535             if (maxcount-- <= 0)
4536                 break;
4537             SPLIT_INSERT(self->str, i + 1, j + 1);
4538             j = i = i - 1;
4539         } else
4540             i--;
4541     }
4542     if (j >= -1) {
4543         SPLIT_INSERT(self->str, 0, j + 1);
4544     }
4545     return list;
4546
4547  onError:
4548     Py_DECREF(list);
4549     return NULL;
4550 }
4551
4552 static
4553 PyObject *rsplit_substring(PyUnicodeObject *self,
4554                            PyObject *list,
4555                            PyUnicodeObject *substring,
4556                            int maxcount)
4557 {
4558     register int i;
4559     register int j;
4560     int len = self->length;
4561     int sublen = substring->length;
4562     PyObject *str;
4563
4564     for (i = len - sublen, j = len; i >= 0; ) {
4565         if (Py_UNICODE_MATCH(self, i, substring)) {
4566             if (maxcount-- <= 0)
4567                 break;
4568             SPLIT_INSERT(self->str, i + sublen, j);
4569             j = i;
4570             i -= sublen;
4571         } else
4572             i--;
4573     }
4574     if (j >= 0) {
4575         SPLIT_INSERT(self->str, 0, j);
4576     }
4577     return list;
4578
4579  onError:
4580     Py_DECREF(list);
4581     return NULL;
4582 }
4583
4584 #undef SPLIT_APPEND
4585 #undef SPLIT_INSERT
4586
4587 static
4588 PyObject *split(PyUnicodeObject *self,
4589                 PyUnicodeObject *substring,
4590                 int maxcount)
4591 {
4592     PyObject *list;
4593
4594     if (maxcount < 0)
4595         maxcount = INT_MAX;
4596
4597     list = PyList_New(0);
4598     if (!list)
4599         return NULL;
4600
4601     if (substring == NULL)
4602         return split_whitespace(self,list,maxcount);
4603
4604     else if (substring->length == 1)
4605         return split_char(self,list,substring->str[0],maxcount);
4606
4607     else if (substring->length == 0) {
4608         Py_DECREF(list);
4609         PyErr_SetString(PyExc_ValueError, "empty separator");
4610         return NULL;
4611     }
4612     else
4613         return split_substring(self,list,substring,maxcount);
4614 }
4615
4616 static
4617 PyObject *rsplit(PyUnicodeObject *self,
4618                  PyUnicodeObject *substring,
4619                  int maxcount)
4620 {
4621     PyObject *list;
4622
4623     if (maxcount < 0)
4624         maxcount = INT_MAX;
4625
4626     list = PyList_New(0);
4627     if (!list)
4628         return NULL;
4629
4630     if (substring == NULL)
4631         return rsplit_whitespace(self,list,maxcount);
4632
4633     else if (substring->length == 1)
4634         return rsplit_char(self,list,substring->str[0],maxcount);
4635
4636     else if (substring->length == 0) {
4637         Py_DECREF(list);
4638         PyErr_SetString(PyExc_ValueError, "empty separator");
4639         return NULL;
4640     }
4641     else
4642         return rsplit_substring(self,list,substring,maxcount);
4643 }
4644
4645 static
4646 PyObject *replace(PyUnicodeObject *self,
4647                   PyUnicodeObject *str1,
4648                   PyUnicodeObject *str2,
4649                   int maxcount)
4650 {
4651     PyUnicodeObject *u;
4652
4653     if (maxcount < 0)
4654         maxcount = INT_MAX;
4655
4656     if (str1->length == 1 && str2->length == 1) {
4657         int i;
4658
4659         /* replace characters */
4660         if (!findchar(self->str, self->length, str1->str[0]) &&
4661             PyUnicode_CheckExact(self)) {
4662             /* nothing to replace, return original string */
4663             Py_INCREF(self);
4664             u = self;
4665         } else {
4666             Py_UNICODE u1 = str1->str[0];
4667             Py_UNICODE u2 = str2->str[0];
4668
4669             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4670                 NULL,
4671                 self->length
4672                 );
4673             if (u != NULL) {
4674                 Py_UNICODE_COPY(u->str, self->str,
4675                                 self->length);
4676                 for (i = 0; i < u->length; i++)
4677                     if (u->str[i] == u1) {
4678                         if (--maxcount < 0)
4679                             break;
4680                         u->str[i] = u2;
4681                     }
4682         }
4683         }
4684
4685     } else {
4686         int n, i;
4687         Py_UNICODE *p;
4688
4689         /* replace strings */
4690         n = count(self, 0, self->length, str1);
4691         if (n > maxcount)
4692             n = maxcount;
4693         if (n == 0) {
4694             /* nothing to replace, return original string */
4695             if (PyUnicode_CheckExact(self)) {
4696                 Py_INCREF(self);
4697                 u = self;
4698             }
4699             else {
4700                 u = (PyUnicodeObject *)
4701                     PyUnicode_FromUnicode(self->str, self->length);
4702             }
4703         } else {
4704             u = _PyUnicode_New(
4705                 self->length + n * (str2->length - str1->length));
4706             if (u) {
4707                 i = 0;
4708                 p = u->str;
4709                 if (str1->length > 0) {
4710                     while (i <= self->length - str1->length)
4711                         if (Py_UNICODE_MATCH(self, i, str1)) {
4712                             /* replace string segment */
4713                             Py_UNICODE_COPY(p, str2->str, str2->length);
4714                             p += str2->length;
4715                             i += str1->length;
4716                             if (--n <= 0) {
4717                                 /* copy remaining part */
4718                                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4719                                 break;
4720                             }
4721                         } else
4722                             *p++ = self->str[i++];
4723                 } else {
4724                     while (n > 0) {
4725                         Py_UNICODE_COPY(p, str2->str, str2->length);
4726                         p += str2->length;
4727                         if (--n <= 0)
4728                             break;
4729                         *p++ = self->str[i++];
4730                     }
4731                     Py_UNICODE_COPY(p, self->str+i, self->length-i);
4732                 }
4733             }
4734         }
4735     }
4736
4737     return (PyObject *) u;
4738 }
4739
4740 /* --- Unicode Object Methods --------------------------------------------- */
4741
4742 PyDoc_STRVAR(title__doc__,
4743 "S.title() -> unicode\n\
4744 \n\
4745 Return a titlecased version of S, i.e. words start with title case\n\
4746 characters, all remaining cased characters have lower case.");
4747
4748 static PyObject*
4749 unicode_title(PyUnicodeObject *self)
4750 {
4751     return fixup(self, fixtitle);
4752 }
4753
4754 PyDoc_STRVAR(capitalize__doc__,
4755 "S.capitalize() -> unicode\n\
4756 \n\
4757 Return a capitalized version of S, i.e. make the first character\n\
4758 have upper case.");
4759
4760 static PyObject*
4761 unicode_capitalize(PyUnicodeObject *self)
4762 {
4763     return fixup(self, fixcapitalize);
4764 }
4765
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into words, replaces every word in the
   list by its fixup(..., fixcapitalize) result and joins the list again
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int n;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4803
4804 /* Argument converter.  Coerces to a single unicode character */
4805
4806 static int
4807 convert_uc(PyObject *obj, void *addr)
4808 {
4809         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4810         PyObject *uniobj;
4811         Py_UNICODE *unistr;
4812
4813         uniobj = PyUnicode_FromObject(obj);
4814         if (uniobj == NULL) {
4815                 PyErr_SetString(PyExc_TypeError,
4816                         "The fill character cannot be converted to Unicode");
4817                 return 0;
4818         }
4819         if (PyUnicode_GET_SIZE(uniobj) != 1) {
4820                 PyErr_SetString(PyExc_TypeError,
4821                         "The fill character must be exactly one character long");
4822                 Py_DECREF(uniobj);
4823                 return 0;
4824         }
4825         unistr = PyUnicode_AS_UNICODE(uniobj);
4826         *fillcharloc = unistr[0];
4827         Py_DECREF(uniobj);
4828         return 1;
4829 }
4830
4831 PyDoc_STRVAR(center__doc__,
4832 "S.center(width[, fillchar]) -> unicode\n\
4833 \n\
4834 Return S centered in a Unicode string of length width. Padding is\n\
4835 done using the specified fill character (default is a space)");
4836
4837 static PyObject *
4838 unicode_center(PyUnicodeObject *self, PyObject *args)
4839 {
4840     int marg, left;
4841     int width;
4842     Py_UNICODE fillchar = ' ';
4843
4844     if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
4845         return NULL;
4846
4847     if (self->length >= width && PyUnicode_CheckExact(self)) {
4848         Py_INCREF(self);
4849         return (PyObject*) self;
4850     }
4851
4852     marg = width - self->length;
4853     left = marg / 2 + (marg & width & 1);
4854
4855     return (PyObject*) pad(self, left, marg - left, fillchar);
4856 }
4857
4858 #if 0
4859
4860 /* This code should go into some future Unicode collation support
4861    module. The basic comparison should compare ordinals on a naive
4862    basis (this is what Java does and thus JPython too). */
4863
4864 /* speedy UTF-16 code point order comparison */
4865 /* gleaned from: */
4866 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4867
4868 static short utf16Fixup[32] =
4869 {
4870     0, 0, 0, 0, 0, 0, 0, 0,
4871     0, 0, 0, 0, 0, 0, 0, 0,
4872     0, 0, 0, 0, 0, 0, 0, 0,
4873     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4874 };
4875
4876 static int
4877 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4878 {
4879     int len1, len2;
4880
4881     Py_UNICODE *s1 = str1->str;
4882     Py_UNICODE *s2 = str2->str;
4883
4884     len1 = str1->length;
4885     len2 = str2->length;
4886
4887     while (len1 > 0 && len2 > 0) {
4888         Py_UNICODE c1, c2;
4889
4890         c1 = *s1++;
4891         c2 = *s2++;
4892
4893         if (c1 > (1<<11) * 26)
4894             c1 += utf16Fixup[c1>>11];
4895         if (c2 > (1<<11) * 26)
4896             c2 += utf16Fixup[c2>>11];
4897         /* now c1 and c2 are in UTF-32-compatible order */
4898
4899         if (c1 != c2)
4900             return (c1 < c2) ? -1 : 1;
4901
4902         len1--; len2--;
4903     }
4904
4905     return (len1 < len2) ? -1 : (len1 != len2);
4906 }
4907
4908 #else
4909
4910 static int
4911 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4912 {
4913     register int len1, len2;
4914
4915     Py_UNICODE *s1 = str1->str;
4916     Py_UNICODE *s2 = str2->str;
4917
4918     len1 = str1->length;
4919     len2 = str2->length;
4920
4921     while (len1 > 0 && len2 > 0) {
4922         Py_UNICODE c1, c2;
4923
4924         c1 = *s1++;
4925         c2 = *s2++;
4926
4927         if (c1 != c2)
4928             return (c1 < c2) ? -1 : 1;
4929
4930         len1--; len2--;
4931     }
4932
4933     return (len1 < len2) ? -1 : (len1 != len2);
4934 }
4935
4936 #endif
4937
4938 int PyUnicode_Compare(PyObject *left,
4939                       PyObject *right)
4940 {
4941     PyUnicodeObject *u = NULL, *v = NULL;
4942     int result;
4943
4944     /* Coerce the two arguments */
4945     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4946     if (u == NULL)
4947         goto onError;
4948     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4949     if (v == NULL)
4950         goto onError;
4951
4952     /* Shortcut for empty or interned objects */
4953     if (v == u) {
4954         Py_DECREF(u);
4955         Py_DECREF(v);
4956         return 0;
4957     }
4958
4959     result = unicode_compare(u, v);
4960
4961     Py_DECREF(u);
4962     Py_DECREF(v);
4963     return result;
4964
4965 onError:
4966     Py_XDECREF(u);
4967     Py_XDECREF(v);
4968     return -1;
4969 }
4970
4971 int PyUnicode_Contains(PyObject *container,
4972                        PyObject *element)
4973 {
4974     PyUnicodeObject *u = NULL, *v = NULL;
4975     int result, size;
4976     register const Py_UNICODE *lhs, *end, *rhs;
4977
4978     /* Coerce the two arguments */
4979     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4980     if (v == NULL) {
4981         PyErr_SetString(PyExc_TypeError,
4982             "'in <string>' requires string as left operand");
4983         goto onError;
4984     }
4985     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4986     if (u == NULL)
4987         goto onError;
4988
4989     size = PyUnicode_GET_SIZE(v);
4990     rhs = PyUnicode_AS_UNICODE(v);
4991     lhs = PyUnicode_AS_UNICODE(u);
4992
4993     result = 0;
4994     if (size == 1) {
4995         end = lhs + PyUnicode_GET_SIZE(u);
4996         while (lhs < end) {
4997             if (*lhs++ == *rhs) {
4998                 result = 1;
4999                 break;
5000             }
5001         }
5002     }
5003     else {
5004         end = lhs + (PyUnicode_GET_SIZE(u) - size);
5005         while (lhs <= end) {
5006             if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
5007                 result = 1;
5008                 break;
5009             }
5010         }
5011     }
5012
5013     Py_DECREF(u);
5014     Py_DECREF(v);
5015     return result;
5016
5017 onError:
5018     Py_XDECREF(u);
5019     Py_XDECREF(v);
5020     return -1;
5021 }
5022
5023 /* Concat to string or Unicode object giving a new Unicode object. */
5024
5025 PyObject *PyUnicode_Concat(PyObject *left,
5026                            PyObject *right)
5027 {
5028     PyUnicodeObject *u = NULL, *v = NULL, *w;
5029
5030     /* Coerce the two arguments */
5031     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5032     if (u == NULL)
5033         goto onError;
5034     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5035     if (v == NULL)
5036         goto onError;
5037
5038     /* Shortcuts */
5039     if (v == unicode_empty) {
5040         Py_DECREF(v);
5041         return (PyObject *)u;
5042     }
5043     if (u == unicode_empty) {
5044         Py_DECREF(u);
5045         return (PyObject *)v;
5046     }
5047
5048     /* Concat the two Unicode strings */
5049     w = _PyUnicode_New(u->length + v->length);
5050     if (w == NULL)
5051         goto onError;
5052     Py_UNICODE_COPY(w->str, u->str, u->length);
5053     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5054
5055     Py_DECREF(u);
5056     Py_DECREF(v);
5057     return (PyObject *)w;
5058
5059 onError:
5060     Py_XDECREF(u);
5061     Py_XDECREF(v);
5062     return NULL;
5063 }
5064
5065 PyDoc_STRVAR(count__doc__,
5066 "S.count(sub[, start[, end]]) -> int\n\
5067 \n\
5068 Return the number of occurrences of substring sub in Unicode string\n\
5069 S[start:end].  Optional arguments start and end are\n\
5070 interpreted as in slice notation.");
5071
5072 static PyObject *
5073 unicode_count(PyUnicodeObject *self, PyObject *args)
5074 {
5075     PyUnicodeObject *substring;
5076     int start = 0;
5077     int end = INT_MAX;
5078     PyObject *result;
5079
5080     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5081                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5082         return NULL;
5083
5084     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5085                                                 (PyObject *)substring);
5086     if (substring == NULL)
5087         return NULL;
5088
5089     if (start < 0)
5090         start += self->length;
5091     if (start < 0)
5092         start = 0;
5093     if (end > self->length)
5094         end = self->length;
5095     if (end < 0)
5096         end += self->length;
5097     if (end < 0)
5098         end = 0;
5099
5100     result = PyInt_FromLong((long) count(self, start, end, substring));
5101
5102     Py_DECREF(substring);
5103     return result;
5104 }
5105
5106 PyDoc_STRVAR(encode__doc__,
5107 "S.encode([encoding[,errors]]) -> string or unicode\n\
5108 \n\
5109 Encodes S using the codec registered for encoding. encoding defaults\n\
5110 to the default encoding. errors may be given to set a different error\n\
5111 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5112 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5113 'xmlcharrefreplace' as well as any other name registered with\n\
5114 codecs.register_error that can handle UnicodeEncodeErrors.");
5115
5116 static PyObject *
5117 unicode_encode(PyUnicodeObject *self, PyObject *args)
5118 {
5119     char *encoding = NULL;
5120     char *errors = NULL;
5121     PyObject *v;
5122
5123     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5124         return NULL;
5125     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5126     if (v == NULL)
5127         goto onError;
5128     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5129         PyErr_Format(PyExc_TypeError,
5130                      "encoder did not return a string/unicode object "
5131                      "(type=%.400s)",
5132                      v->ob_type->tp_name);
5133         Py_DECREF(v);
5134         return NULL;
5135     }
5136     return v;
5137
5138  onError:
5139     return NULL;
5140 }
5141
5142 PyDoc_STRVAR(decode__doc__,
5143 "S.decode([encoding[,errors]]) -> string or unicode\n\
5144 \n\
5145 Decodes S using the codec registered for encoding. encoding defaults\n\
5146 to the default encoding. errors may be given to set a different error\n\
5147 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5148 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5149 as well as any other name registerd with codecs.register_error that is\n\
5150 able to handle UnicodeDecodeErrors.");
5151
5152 static PyObject *
5153 unicode_decode(PyUnicodeObject *self, PyObject *args)
5154 {
5155     char *encoding = NULL;
5156     char *errors = NULL;
5157     PyObject *v;
5158
5159     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5160         return NULL;
5161     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5162     if (v == NULL)
5163         goto onError;
5164     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5165         PyErr_Format(PyExc_TypeError,
5166                      "decoder did not return a string/unicode object "
5167                      "(type=%.400s)",
5168                      v->ob_type->tp_name);
5169         Py_DECREF(v);
5170         return NULL;
5171     }
5172     return v;
5173
5174  onError:
5175     return NULL;
5176 }
5177
5178 PyDoc_STRVAR(expandtabs__doc__,
5179 "S.expandtabs([tabsize]) -> unicode\n\
5180 \n\
5181 Return a copy of S where all tab characters are expanded using spaces.\n\
5182 If tabsize is not given, a tab size of 8 characters is assumed.");
5183
5184 static PyObject*
5185 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5186 {
5187     Py_UNICODE *e;
5188     Py_UNICODE *p;
5189     Py_UNICODE *q;
5190     int i, j;
5191     PyUnicodeObject *u;
5192     int tabsize = 8;
5193
5194     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5195         return NULL;
5196
5197     /* First pass: determine size of output string */
5198     i = j = 0;
5199     e = self->str + self->length;
5200     for (p = self->str; p < e; p++)
5201         if (*p == '\t') {
5202             if (tabsize > 0)
5203                 j += tabsize - (j % tabsize);
5204         }
5205         else {
5206             j++;
5207             if (*p == '\n' || *p == '\r') {
5208                 i += j;
5209                 j = 0;
5210             }
5211         }
5212
5213     /* Second pass: create output string and fill it */
5214     u = _PyUnicode_New(i + j);
5215     if (!u)
5216         return NULL;
5217
5218     j = 0;
5219     q = u->str;
5220
5221     for (p = self->str; p < e; p++)
5222         if (*p == '\t') {
5223             if (tabsize > 0) {
5224                 i = tabsize - (j % tabsize);
5225                 j += i;
5226                 while (i--)
5227                     *q++ = ' ';
5228             }
5229         }
5230         else {
5231             j++;
5232             *q++ = *p;
5233             if (*p == '\n' || *p == '\r')
5234                 j = 0;
5235         }
5236
5237     return (PyObject*) u;
5238 }
5239
5240 PyDoc_STRVAR(find__doc__,
5241 "S.find(sub [,start [,end]]) -> int\n\
5242 \n\
5243 Return the lowest index in S where substring sub is found,\n\
5244 such that sub is contained within s[start,end].  Optional\n\
5245 arguments start and end are interpreted as in slice notation.\n\
5246 \n\
5247 Return -1 on failure.");
5248
5249 static PyObject *
5250 unicode_find(PyUnicodeObject *self, PyObject *args)
5251 {
5252     PyUnicodeObject *substring;
5253     int start = 0;
5254     int end = INT_MAX;
5255     PyObject *result;
5256
5257     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5258                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5259         return NULL;
5260     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5261                                                 (PyObject *)substring);
5262     if (substring == NULL)
5263         return NULL;
5264
5265     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5266
5267     Py_DECREF(substring);
5268     return result;
5269 }
5270
5271 static PyObject *
5272 unicode_getitem(PyUnicodeObject *self, int index)
5273 {
5274     if (index < 0 || index >= self->length) {
5275         PyErr_SetString(PyExc_IndexError, "string index out of range");
5276         return NULL;
5277     }
5278
5279     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5280 }
5281
5282 static long
5283 unicode_hash(PyUnicodeObject *self)
5284 {
5285     /* Since Unicode objects compare equal to their ASCII string
5286        counterparts, they should use the individual character values
5287        as basis for their hash value.  This is needed to assure that
5288        strings and Unicode objects behave in the same way as
5289        dictionary keys. */
5290
5291     register int len;
5292     register Py_UNICODE *p;
5293     register long x;
5294
5295     if (self->hash != -1)
5296         return self->hash;
5297     len = PyUnicode_GET_SIZE(self);
5298     p = PyUnicode_AS_UNICODE(self);
5299     x = *p << 7;
5300     while (--len >= 0)
5301         x = (1000003*x) ^ *p++;
5302     x ^= PyUnicode_GET_SIZE(self);
5303     if (x == -1)
5304         x = -2;
5305     self->hash = x;
5306     return x;
5307 }
5308
5309 PyDoc_STRVAR(index__doc__,
5310 "S.index(sub [,start [,end]]) -> int\n\
5311 \n\
5312 Like S.find() but raise ValueError when the substring is not found.");
5313
5314 static PyObject *
5315 unicode_index(PyUnicodeObject *self, PyObject *args)
5316 {
5317     int result;
5318     PyUnicodeObject *substring;
5319     int start = 0;
5320     int end = INT_MAX;
5321
5322     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5323                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5324         return NULL;
5325
5326     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5327                                                 (PyObject *)substring);
5328     if (substring == NULL)
5329         return NULL;
5330
5331     result = findstring(self, substring, start, end, 1);
5332
5333     Py_DECREF(substring);
5334     if (result < 0) {
5335         PyErr_SetString(PyExc_ValueError, "substring not found");
5336         return NULL;
5337     }
5338     return PyInt_FromLong(result);
5339 }
5340
5341 PyDoc_STRVAR(islower__doc__,
5342 "S.islower() -> bool\n\
5343 \n\
5344 Return True if all cased characters in S are lowercase and there is\n\
5345 at least one cased character in S, False otherwise.");
5346
5347 static PyObject*
5348 unicode_islower(PyUnicodeObject *self)
5349 {
5350     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5351     register const Py_UNICODE *e;
5352     int cased;
5353
5354     /* Shortcut for single character strings */
5355     if (PyUnicode_GET_SIZE(self) == 1)
5356         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5357
5358     /* Special case for empty strings */
5359     if (PyString_GET_SIZE(self) == 0)
5360         return PyBool_FromLong(0);
5361
5362     e = p + PyUnicode_GET_SIZE(self);
5363     cased = 0;
5364     for (; p < e; p++) {
5365         register const Py_UNICODE ch = *p;
5366
5367         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5368             return PyBool_FromLong(0);
5369         else if (!cased && Py_UNICODE_ISLOWER(ch))
5370             cased = 1;
5371     }
5372     return PyBool_FromLong(cased);
5373 }
5374
5375 PyDoc_STRVAR(isupper__doc__,
5376 "S.isupper() -> bool\n\
5377 \n\
5378 Return True if all cased characters in S are uppercase and there is\n\
5379 at least one cased character in S, False otherwise.");
5380
5381 static PyObject*
5382 unicode_isupper(PyUnicodeObject *self)
5383 {
5384     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385     register const Py_UNICODE *e;
5386     int cased;
5387
5388     /* Shortcut for single character strings */
5389     if (PyUnicode_GET_SIZE(self) == 1)
5390         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5391
5392     /* Special case for empty strings */
5393     if (PyString_GET_SIZE(self) == 0)
5394         return PyBool_FromLong(0);
5395
5396     e = p + PyUnicode_GET_SIZE(self);
5397     cased = 0;
5398     for (; p < e; p++) {
5399         register const Py_UNICODE ch = *p;
5400
5401         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5402             return PyBool_FromLong(0);
5403         else if (!cased && Py_UNICODE_ISUPPER(ch))
5404             cased = 1;
5405     }
5406     return PyBool_FromLong(cased);
5407 }
5408
5409 PyDoc_STRVAR(istitle__doc__,
5410 "S.istitle() -> bool\n\
5411 \n\
5412 Return True if S is a titlecased string and there is at least one\n\
5413 character in S, i.e. upper- and titlecase characters may only\n\
5414 follow uncased characters and lowercase characters only cased ones.\n\
5415 Return False otherwise.");
5416
5417 static PyObject*
5418 unicode_istitle(PyUnicodeObject *self)
5419 {
5420     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5421     register const Py_UNICODE *e;
5422     int cased, previous_is_cased;
5423
5424     /* Shortcut for single character strings */
5425     if (PyUnicode_GET_SIZE(self) == 1)
5426         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5427                                (Py_UNICODE_ISUPPER(*p) != 0));
5428
5429     /* Special case for empty strings */
5430     if (PyString_GET_SIZE(self) == 0)
5431         return PyBool_FromLong(0);
5432
5433     e = p + PyUnicode_GET_SIZE(self);
5434     cased = 0;
5435     previous_is_cased = 0;
5436     for (; p < e; p++) {
5437         register const Py_UNICODE ch = *p;
5438
5439         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5440             if (previous_is_cased)
5441                 return PyBool_FromLong(0);
5442             previous_is_cased = 1;
5443             cased = 1;
5444         }
5445         else if (Py_UNICODE_ISLOWER(ch)) {
5446             if (!previous_is_cased)
5447                 return PyBool_FromLong(0);
5448             previous_is_cased = 1;
5449             cased = 1;
5450         }
5451         else
5452             previous_is_cased = 0;
5453     }
5454     return PyBool_FromLong(cased);
5455 }
5456
5457 PyDoc_STRVAR(isspace__doc__,
5458 "S.isspace() -> bool\n\
5459 \n\
5460 Return True if all characters in S are whitespace\n\
5461 and there is at least one character in S, False otherwise.");
5462
5463 static PyObject*
5464 unicode_isspace(PyUnicodeObject *self)
5465 {
5466     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5467     register const Py_UNICODE *e;
5468
5469     /* Shortcut for single character strings */
5470     if (PyUnicode_GET_SIZE(self) == 1 &&
5471         Py_UNICODE_ISSPACE(*p))
5472         return PyBool_FromLong(1);
5473
5474     /* Special case for empty strings */
5475     if (PyString_GET_SIZE(self) == 0)
5476         return PyBool_FromLong(0);
5477
5478     e = p + PyUnicode_GET_SIZE(self);
5479     for (; p < e; p++) {
5480         if (!Py_UNICODE_ISSPACE(*p))
5481             return PyBool_FromLong(0);
5482     }
5483     return PyBool_FromLong(1);
5484 }
5485
5486 PyDoc_STRVAR(isalpha__doc__,
5487 "S.isalpha() -> bool\n\
5488 \n\
5489 Return True if all characters in S are alphabetic\n\
5490 and there is at least one character in S, False otherwise.");
5491
5492 static PyObject*
5493 unicode_isalpha(PyUnicodeObject *self)
5494 {
5495     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5496     register const Py_UNICODE *e;
5497
5498     /* Shortcut for single character strings */
5499     if (PyUnicode_GET_SIZE(self) == 1 &&
5500         Py_UNICODE_ISALPHA(*p))
5501         return PyBool_FromLong(1);
5502
5503     /* Special case for empty strings */
5504     if (PyString_GET_SIZE(self) == 0)
5505         return PyBool_FromLong(0);
5506
5507     e = p + PyUnicode_GET_SIZE(self);
5508     for (; p < e; p++) {
5509         if (!Py_UNICODE_ISALPHA(*p))
5510             return PyBool_FromLong(0);
5511     }
5512     return PyBool_FromLong(1);
5513 }
5514
5515 PyDoc_STRVAR(isalnum__doc__,
5516 "S.isalnum() -> bool\n\
5517 \n\
5518 Return True if all characters in S are alphanumeric\n\
5519 and there is at least one character in S, False otherwise.");
5520
5521 static PyObject*
5522 unicode_isalnum(PyUnicodeObject *self)
5523 {
5524     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5525     register const Py_UNICODE *e;
5526
5527     /* Shortcut for single character strings */
5528     if (PyUnicode_GET_SIZE(self) == 1 &&
5529         Py_UNICODE_ISALNUM(*p))
5530         return PyBool_FromLong(1);
5531
5532     /* Special case for empty strings */
5533     if (PyString_GET_SIZE(self) == 0)
5534         return PyBool_FromLong(0);
5535
5536     e = p + PyUnicode_GET_SIZE(self);
5537     for (; p < e; p++) {
5538         if (!Py_UNICODE_ISALNUM(*p))
5539             return PyBool_FromLong(0);
5540     }
5541     return PyBool_FromLong(1);
5542 }
5543
5544 PyDoc_STRVAR(isdecimal__doc__,
5545 "S.isdecimal() -> bool\n\
5546 \n\
5547 Return True if there are only decimal characters in S,\n\
5548 False otherwise.");
5549
5550 static PyObject*
5551 unicode_isdecimal(PyUnicodeObject *self)
5552 {
5553     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5554     register const Py_UNICODE *e;
5555
5556     /* Shortcut for single character strings */
5557     if (PyUnicode_GET_SIZE(self) == 1 &&
5558         Py_UNICODE_ISDECIMAL(*p))
5559         return PyBool_FromLong(1);
5560
5561     /* Special case for empty strings */
5562     if (PyString_GET_SIZE(self) == 0)
5563         return PyBool_FromLong(0);
5564
5565     e = p + PyUnicode_GET_SIZE(self);
5566     for (; p < e; p++) {
5567         if (!Py_UNICODE_ISDECIMAL(*p))
5568             return PyBool_FromLong(0);
5569     }
5570     return PyBool_FromLong(1);
5571 }
5572
5573 PyDoc_STRVAR(isdigit__doc__,
5574 "S.isdigit() -> bool\n\
5575 \n\
5576 Return True if all characters in S are digits\n\
5577 and there is at least one character in S, False otherwise.");
5578
5579 static PyObject*
5580 unicode_isdigit(PyUnicodeObject *self)
5581 {
5582     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5583     register const Py_UNICODE *e;
5584
5585     /* Shortcut for single character strings */
5586     if (PyUnicode_GET_SIZE(self) == 1 &&
5587         Py_UNICODE_ISDIGIT(*p))
5588         return PyBool_FromLong(1);
5589
5590     /* Special case for empty strings */
5591     if (PyString_GET_SIZE(self) == 0)
5592         return PyBool_FromLong(0);
5593
5594     e = p + PyUnicode_GET_SIZE(self);
5595     for (; p < e; p++) {
5596         if (!Py_UNICODE_ISDIGIT(*p))
5597             return PyBool_FromLong(0);
5598     }
5599     return PyBool_FromLong(1);
5600 }
5601
5602 PyDoc_STRVAR(isnumeric__doc__,
5603 "S.isnumeric() -> bool\n\
5604 \n\
5605 Return True if there are only numeric characters in S,\n\
5606 False otherwise.");
5607
5608 static PyObject*
5609 unicode_isnumeric(PyUnicodeObject *self)
5610 {
5611     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5612     register const Py_UNICODE *e;
5613
5614     /* Shortcut for single character strings */
5615     if (PyUnicode_GET_SIZE(self) == 1 &&
5616         Py_UNICODE_ISNUMERIC(*p))
5617         return PyBool_FromLong(1);
5618
5619     /* Special case for empty strings */
5620     if (PyString_GET_SIZE(self) == 0)
5621         return PyBool_FromLong(0);
5622
5623     e = p + PyUnicode_GET_SIZE(self);
5624     for (; p < e; p++) {
5625         if (!Py_UNICODE_ISNUMERIC(*p))
5626             return PyBool_FromLong(0);
5627     }
5628     return PyBool_FromLong(1);
5629 }
5630
5631 PyDoc_STRVAR(join__doc__,
5632 "S.join(sequence) -> unicode\n\
5633 \n\
5634 Return a string which is the concatenation of the strings in the\n\
5635 sequence.  The separator between elements is S.");
5636
5637 static PyObject*
5638 unicode_join(PyObject *self, PyObject *data)
5639 {
5640     return PyUnicode_Join(self, data);
5641 }
5642
5643 static int
5644 unicode_length(PyUnicodeObject *self)
5645 {
5646     return self->length;
5647 }
5648
5649 PyDoc_STRVAR(ljust__doc__,
5650 "S.ljust(width[, fillchar]) -> int\n\
5651 \n\
5652 Return S left justified in a Unicode string of length width. Padding is\n\
5653 done using the specified fill character (default is a space).");
5654
5655 static PyObject *
5656 unicode_ljust(PyUnicodeObject *self, PyObject *args)
5657 {
5658     int width;
5659     Py_UNICODE fillchar = ' ';
5660
5661     if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
5662         return NULL;
5663
5664     if (self->length >= width && PyUnicode_CheckExact(self)) {
5665         Py_INCREF(self);
5666         return (PyObject*) self;
5667     }
5668
5669     return (PyObject*) pad(self, 0, width - self->length, fillchar);
5670 }
5671
5672 PyDoc_STRVAR(lower__doc__,
5673 "S.lower() -> unicode\n\
5674 \n\
5675 Return a copy of the string S converted to lowercase.");
5676
5677 static PyObject*
5678 unicode_lower(PyUnicodeObject *self)
5679 {
5680     return fixup(self, fixlower);
5681 }
5682
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5691
5692 static const Py_UNICODE *
5693 unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5694 {
5695         size_t i;
5696         for (i = 0; i < n; ++i)
5697                 if (s[i] == c)
5698                         return s+i;
5699         return NULL;
5700 }
5701
5702 /* externally visible for str.strip(unicode) */
5703 PyObject *
5704 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5705 {
5706         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5707         int len = PyUnicode_GET_SIZE(self);
5708         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5709         int seplen = PyUnicode_GET_SIZE(sepobj);
5710         int i, j;
5711
5712         i = 0;
5713         if (striptype != RIGHTSTRIP) {
5714                 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5715                         i++;
5716                 }
5717         }
5718
5719         j = len;
5720         if (striptype != LEFTSTRIP) {
5721                 do {
5722                         j--;
5723                 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5724                 j++;
5725         }
5726
5727         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5728                 Py_INCREF(self);
5729                 return (PyObject*)self;
5730         }
5731         else
5732                 return PyUnicode_FromUnicode(s+i, j-i);
5733 }
5734
5735
5736 static PyObject *
5737 do_strip(PyUnicodeObject *self, int striptype)
5738 {
5739         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5740         int len = PyUnicode_GET_SIZE(self), i, j;
5741
5742         i = 0;
5743         if (striptype != RIGHTSTRIP) {
5744                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5745                         i++;
5746                 }
5747         }
5748
5749         j = len;
5750         if (striptype != LEFTSTRIP) {
5751                 do {
5752                         j--;
5753                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5754                 j++;
5755         }
5756
5757         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5758                 Py_INCREF(self);
5759                 return (PyObject*)self;
5760         }
5761         else
5762                 return PyUnicode_FromUnicode(s+i, j-i);
5763 }
5764
5765
5766 static PyObject *
5767 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5768 {
5769         PyObject *sep = NULL;
5770
5771         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5772                 return NULL;
5773
5774         if (sep != NULL && sep != Py_None) {
5775                 if (PyUnicode_Check(sep))
5776                         return _PyUnicode_XStrip(self, striptype, sep);
5777                 else if (PyString_Check(sep)) {
5778                         PyObject *res;
5779                         sep = PyUnicode_FromObject(sep);
5780                         if (sep==NULL)
5781                                 return NULL;
5782                         res = _PyUnicode_XStrip(self, striptype, sep);
5783                         Py_DECREF(sep);
5784                         return res;
5785                 }
5786                 else {
5787                         PyErr_Format(PyExc_TypeError,
5788                                      "%s arg must be None, unicode or str",
5789                                      STRIPNAME(striptype));
5790                         return NULL;
5791                 }
5792         }
5793
5794         return do_strip(self, striptype);
5795 }
5796
5797
5798 PyDoc_STRVAR(strip__doc__,
5799 "S.strip([chars]) -> unicode\n\
5800 \n\
5801 Return a copy of the string S with leading and trailing\n\
5802 whitespace removed.\n\
5803 If chars is given and not None, remove characters in chars instead.\n\
5804 If chars is a str, it will be converted to unicode before stripping");
5805
5806 static PyObject *
5807 unicode_strip(PyUnicodeObject *self, PyObject *args)
5808 {
5809         if (PyTuple_GET_SIZE(args) == 0)
5810                 return do_strip(self, BOTHSTRIP); /* Common case */
5811         else
5812                 return do_argstrip(self, BOTHSTRIP, args);
5813 }
5814
5815
5816 PyDoc_STRVAR(lstrip__doc__,
5817 "S.lstrip([chars]) -> unicode\n\
5818 \n\
5819 Return a copy of the string S with leading whitespace removed.\n\
5820 If chars is given and not None, remove characters in chars instead.\n\
5821 If chars is a str, it will be converted to unicode before stripping");
5822
5823 static PyObject *
5824 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5825 {
5826         if (PyTuple_GET_SIZE(args) == 0)
5827                 return do_strip(self, LEFTSTRIP); /* Common case */
5828         else
5829                 return do_argstrip(self, LEFTSTRIP, args);
5830 }
5831
5832
5833 PyDoc_STRVAR(rstrip__doc__,
5834 "S.rstrip([chars]) -> unicode\n\
5835 \n\
5836 Return a copy of the string S with trailing whitespace removed.\n\
5837 If chars is given and not None, remove characters in chars instead.\n\
5838 If chars is a str, it will be converted to unicode before stripping");
5839
5840 static PyObject *
5841 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5842 {
5843         if (PyTuple_GET_SIZE(args) == 0)
5844                 return do_strip(self, RIGHTSTRIP); /* Common case */
5845         else
5846                 return do_argstrip(self, RIGHTSTRIP, args);
5847 }
5848
5849
5850 static PyObject*
5851 unicode_repeat(PyUnicodeObject *str, int len)
5852 {
5853     PyUnicodeObject *u;
5854     Py_UNICODE *p;
5855     int nchars;
5856     size_t nbytes;
5857
5858     if (len < 0)
5859         len = 0;
5860
5861     if (len == 1 && PyUnicode_CheckExact(str)) {
5862         /* no repeat, return original string */
5863         Py_INCREF(str);
5864         return (PyObject*) str;
5865     }
5866
5867     /* ensure # of chars needed doesn't overflow int and # of bytes
5868      * needed doesn't overflow size_t
5869      */
5870     nchars = len * str->length;
5871     if (len && nchars / len != str->length) {
5872         PyErr_SetString(PyExc_OverflowError,
5873                         "repeated string is too long");
5874         return NULL;
5875     }
5876     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5877     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5878         PyErr_SetString(PyExc_OverflowError,
5879                         "repeated string is too long");
5880         return NULL;
5881     }
5882     u = _PyUnicode_New(nchars);
5883     if (!u)
5884         return NULL;
5885
5886     p = u->str;
5887
5888     while (len-- > 0) {
5889         Py_UNICODE_COPY(p, str->str, str->length);
5890         p += str->length;
5891     }
5892
5893     return (PyObject*) u;
5894 }
5895
5896 PyObject *PyUnicode_Replace(PyObject *obj,
5897                             PyObject *subobj,
5898                             PyObject *replobj,
5899                             int maxcount)
5900 {
5901     PyObject *self;
5902     PyObject *str1;
5903     PyObject *str2;
5904     PyObject *result;
5905
5906     self = PyUnicode_FromObject(obj);
5907     if (self == NULL)
5908         return NULL;
5909     str1 = PyUnicode_FromObject(subobj);
5910     if (str1 == NULL) {
5911         Py_DECREF(self);
5912         return NULL;
5913     }
5914     str2 = PyUnicode_FromObject(replobj);
5915     if (str2 == NULL) {
5916         Py_DECREF(self);
5917         Py_DECREF(str1);
5918         return NULL;
5919     }
5920     result = replace((PyUnicodeObject *)self,
5921                      (PyUnicodeObject *)str1,
5922                      (PyUnicodeObject *)str2,
5923                      maxcount);
5924     Py_DECREF(self);
5925     Py_DECREF(str1);
5926     Py_DECREF(str2);
5927     return result;
5928 }
5929
5930 PyDoc_STRVAR(replace__doc__,
5931 "S.replace (old, new[, maxsplit]) -> unicode\n\
5932 \n\
5933 Return a copy of S with all occurrences of substring\n\
5934 old replaced by new.  If the optional argument maxsplit is\n\
5935 given, only the first maxsplit occurrences are replaced.");
5936
5937 static PyObject*
5938 unicode_replace(PyUnicodeObject *self, PyObject *args)
5939 {
5940     PyUnicodeObject *str1;
5941     PyUnicodeObject *str2;
5942     int maxcount = -1;
5943     PyObject *result;
5944
5945     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5946         return NULL;
5947     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5948     if (str1 == NULL)
5949         return NULL;
5950     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5951     if (str2 == NULL) {
5952         Py_DECREF(str1);
5953         return NULL;
5954     }
5955
5956     result = replace(self, str1, str2, maxcount);
5957
5958     Py_DECREF(str1);
5959     Py_DECREF(str2);
5960     return result;
5961 }
5962
5963 static
5964 PyObject *unicode_repr(PyObject *unicode)
5965 {
5966     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5967                                 PyUnicode_GET_SIZE(unicode),
5968                                 1);
5969 }
5970
5971 PyDoc_STRVAR(rfind__doc__,
5972 "S.rfind(sub [,start [,end]]) -> int\n\
5973 \n\
5974 Return the highest index in S where substring sub is found,\n\
5975 such that sub is contained within s[start,end].  Optional\n\
5976 arguments start and end are interpreted as in slice notation.\n\
5977 \n\
5978 Return -1 on failure.");
5979
5980 static PyObject *
5981 unicode_rfind(PyUnicodeObject *self, PyObject *args)
5982 {
5983     PyUnicodeObject *substring;
5984     int start = 0;
5985     int end = INT_MAX;
5986     PyObject *result;
5987
5988     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5989                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5990         return NULL;
5991     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5992                                                 (PyObject *)substring);
5993     if (substring == NULL)
5994         return NULL;
5995
5996     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5997
5998     Py_DECREF(substring);
5999     return result;
6000 }
6001
6002 PyDoc_STRVAR(rindex__doc__,
6003 "S.rindex(sub [,start [,end]]) -> int\n\
6004 \n\
6005 Like S.rfind() but raise ValueError when the substring is not found.");
6006
6007 static PyObject *
6008 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6009 {
6010     int result;
6011     PyUnicodeObject *substring;
6012     int start = 0;
6013     int end = INT_MAX;
6014
6015     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6016                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6017         return NULL;
6018     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6019                                                 (PyObject *)substring);
6020     if (substring == NULL)
6021         return NULL;
6022
6023     result = findstring(self, substring, start, end, -1);
6024
6025     Py_DECREF(substring);
6026     if (result < 0) {
6027         PyErr_SetString(PyExc_ValueError, "substring not found");
6028         return NULL;
6029     }
6030     return PyInt_FromLong(result);
6031 }
6032
6033 PyDoc_STRVAR(rjust__doc__,
6034 "S.rjust(width[, fillchar]) -> unicode\n\
6035 \n\
6036 Return S right justified in a Unicode string of length width. Padding is\n\
6037 done using the specified fill character (default is a space).");
6038
6039 static PyObject *
6040 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6041 {
6042     int width;
6043     Py_UNICODE fillchar = ' ';
6044
6045     if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
6046         return NULL;
6047
6048     if (self->length >= width && PyUnicode_CheckExact(self)) {
6049         Py_INCREF(self);
6050         return (PyObject*) self;
6051     }
6052
6053     return (PyObject*) pad(self, width - self->length, 0, fillchar);
6054 }
6055
6056 static PyObject*
6057 unicode_slice(PyUnicodeObject *self, int start, int end)
6058 {
6059     /* standard clamping */
6060     if (start < 0)
6061         start = 0;
6062     if (end < 0)
6063         end = 0;
6064     if (end > self->length)
6065         end = self->length;
6066     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6067         /* full slice, return original string */
6068         Py_INCREF(self);
6069         return (PyObject*) self;
6070     }
6071     if (start > end)
6072         start = end;
6073     /* copy slice */
6074     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6075                                              end - start);
6076 }
6077
6078 PyObject *PyUnicode_Split(PyObject *s,
6079                           PyObject *sep,
6080                           int maxsplit)
6081 {
6082     PyObject *result;
6083
6084     s = PyUnicode_FromObject(s);
6085     if (s == NULL)
6086         return NULL;
6087     if (sep != NULL) {
6088         sep = PyUnicode_FromObject(sep);
6089         if (sep == NULL) {
6090             Py_DECREF(s);
6091             return NULL;
6092         }
6093     }
6094
6095     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6096
6097     Py_DECREF(s);
6098     Py_XDECREF(sep);
6099     return result;
6100 }
6101
6102 PyDoc_STRVAR(split__doc__,
6103 "S.split([sep [,maxsplit]]) -> list of strings\n\
6104 \n\
6105 Return a list of the words in S, using sep as the\n\
6106 delimiter string.  If maxsplit is given, at most maxsplit\n\
6107 splits are done. If sep is not specified or is None,\n\
6108 any whitespace string is a separator.");
6109
6110 static PyObject*
6111 unicode_split(PyUnicodeObject *self, PyObject *args)
6112 {
6113     PyObject *substring = Py_None;
6114     int maxcount = -1;
6115
6116     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6117         return NULL;
6118
6119     if (substring == Py_None)
6120         return split(self, NULL, maxcount);
6121     else if (PyUnicode_Check(substring))
6122         return split(self, (PyUnicodeObject *)substring, maxcount);
6123     else
6124         return PyUnicode_Split((PyObject *)self, substring, maxcount);
6125 }
6126
6127 PyObject *PyUnicode_RSplit(PyObject *s,
6128                            PyObject *sep,
6129                            int maxsplit)
6130 {
6131     PyObject *result;
6132
6133     s = PyUnicode_FromObject(s);
6134     if (s == NULL)
6135         return NULL;
6136     if (sep != NULL) {
6137         sep = PyUnicode_FromObject(sep);
6138         if (sep == NULL) {
6139             Py_DECREF(s);
6140             return NULL;
6141         }
6142     }
6143
6144     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6145
6146     Py_DECREF(s);
6147     Py_XDECREF(sep);
6148     return result;
6149 }
6150
6151 PyDoc_STRVAR(rsplit__doc__,
6152 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6153 \n\
6154 Return a list of the words in S, using sep as the\n\
6155 delimiter string, starting at the end of the string and\n\
6156 working to the front.  If maxsplit is given, at most maxsplit\n\
6157 splits are done. If sep is not specified, any whitespace string\n\
6158 is a separator.");
6159
6160 static PyObject*
6161 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6162 {
6163     PyObject *substring = Py_None;
6164     int maxcount = -1;
6165
6166     if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6167         return NULL;
6168
6169     if (substring == Py_None)
6170         return rsplit(self, NULL, maxcount);
6171     else if (PyUnicode_Check(substring))
6172         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6173     else
6174         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6175 }
6176
6177 PyDoc_STRVAR(splitlines__doc__,
6178 "S.splitlines([keepends]]) -> list of strings\n\
6179 \n\
6180 Return a list of the lines in S, breaking at line boundaries.\n\
6181 Line breaks are not included in the resulting list unless keepends\n\
6182 is given and true.");
6183
6184 static PyObject*
6185 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6186 {
6187     int keepends = 0;
6188
6189     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6190         return NULL;
6191
6192     return PyUnicode_Splitlines((PyObject *)self, keepends);
6193 }
6194
6195 static
6196 PyObject *unicode_str(PyUnicodeObject *self)
6197 {
6198     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6199 }
6200
6201 PyDoc_STRVAR(swapcase__doc__,
6202 "S.swapcase() -> unicode\n\
6203 \n\
6204 Return a copy of S with uppercase characters converted to lowercase\n\
6205 and vice versa.");
6206
6207 static PyObject*
6208 unicode_swapcase(PyUnicodeObject *self)
6209 {
6210     return fixup(self, fixswapcase);
6211 }
6212
6213 PyDoc_STRVAR(translate__doc__,
6214 "S.translate(table) -> unicode\n\
6215 \n\
6216 Return a copy of the string S, where all characters have been mapped\n\
6217 through the given translation table, which must be a mapping of\n\
6218 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6219 Unmapped characters are left untouched. Characters mapped to None\n\
6220 are deleted.");
6221
6222 static PyObject*
6223 unicode_translate(PyUnicodeObject *self, PyObject *table)
6224 {
6225     return PyUnicode_TranslateCharmap(self->str,
6226                                       self->length,
6227                                       table,
6228                                       "ignore");
6229 }
6230
6231 PyDoc_STRVAR(upper__doc__,
6232 "S.upper() -> unicode\n\
6233 \n\
6234 Return a copy of S converted to uppercase.");
6235
6236 static PyObject*
6237 unicode_upper(PyUnicodeObject *self)
6238 {
6239     return fixup(self, fixupper);
6240 }
6241
6242 PyDoc_STRVAR(zfill__doc__,
6243 "S.zfill(width) -> unicode\n\
6244 \n\
6245 Pad a numeric string x with zeros on the left, to fill a field\n\
6246 of the specified width. The string x is never truncated.");
6247
6248 static PyObject *
6249 unicode_zfill(PyUnicodeObject *self, PyObject *args)
6250 {
6251     int fill;
6252     PyUnicodeObject *u;
6253
6254     int width;
6255     if (!PyArg_ParseTuple(args, "i:zfill", &width))
6256         return NULL;
6257
6258     if (self->length >= width) {
6259         if (PyUnicode_CheckExact(self)) {
6260             Py_INCREF(self);
6261             return (PyObject*) self;
6262         }
6263         else
6264             return PyUnicode_FromUnicode(
6265                 PyUnicode_AS_UNICODE(self),
6266                 PyUnicode_GET_SIZE(self)
6267             );
6268     }
6269
6270     fill = width - self->length;
6271
6272     u = pad(self, fill, 0, '0');
6273
6274     if (u == NULL)
6275         return NULL;
6276
6277     if (u->str[fill] == '+' || u->str[fill] == '-') {
6278         /* move sign to beginning of string */
6279         u->str[0] = u->str[fill];
6280         u->str[fill] = '0';
6281     }
6282
6283     return (PyObject*) u;
6284 }
6285
#if 0
/* Compiled out.  Wraps the current value of unicode_freelist_size
   with PyInt_FromLong(). */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj;

    size_obj = PyInt_FromLong(unicode_freelist_size);
    return size_obj;
}
#endif
6293
6294 PyDoc_STRVAR(startswith__doc__,
6295 "S.startswith(prefix[, start[, end]]) -> bool\n\
6296 \n\
6297 Return True if S starts with the specified prefix, False otherwise.\n\
6298 With optional start, test S beginning at that position.\n\
6299 With optional end, stop comparing S at that position.");
6300
6301 static PyObject *
6302 unicode_startswith(PyUnicodeObject *self,
6303                    PyObject *args)
6304 {
6305     PyUnicodeObject *substring;
6306     int start = 0;
6307     int end = INT_MAX;
6308     PyObject *result;
6309
6310     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6311                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6312         return NULL;
6313     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6314                                                 (PyObject *)substring);
6315     if (substring == NULL)
6316         return NULL;
6317
6318     result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
6319
6320     Py_DECREF(substring);
6321     return result;
6322 }
6323
6324
6325 PyDoc_STRVAR(endswith__doc__,
6326 "S.endswith(suffix[, start[, end]]) -> bool\n\
6327 \n\
6328 Return True if S ends with the specified suffix, False otherwise.\n\
6329 With optional start, test S beginning at that position.\n\
6330 With optional end, stop comparing S at that position.");
6331
6332 static PyObject *
6333 unicode_endswith(PyUnicodeObject *self,
6334                  PyObject *args)
6335 {
6336     PyUnicodeObject *substring;
6337     int start = 0;
6338     int end = INT_MAX;
6339     PyObject *result;
6340
6341     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6342                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6343         return NULL;
6344     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6345                                                 (PyObject *)substring);
6346     if (substring == NULL)
6347         return NULL;
6348
6349     result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
6350
6351     Py_DECREF(substring);
6352     return result;
6353 }
6354
6355
6356
6357 static PyObject *
6358 unicode_getnewargs(PyUnicodeObject *v)
6359 {
6360         return Py_BuildValue("(u#)", v->str, v->length);
6361 }
6362
6363
6364 static PyMethodDef unicode_methods[] = {
6365
6366     /* Order is according to common usage: often used methods should
6367        appear first, since lookup is done sequentially. */
6368
6369     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6370     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6371     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
6372     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
6373     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6374     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6375     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6376     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6377     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6378     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6379     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6380     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6381     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6382     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
6383     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
6384     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
6385 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6386     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6387     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6388     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
6389     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
6390     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
6391     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
6392     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6393     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6394     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6395     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6396     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6397     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6398     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6399     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6400     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6401     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6402     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6403     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6404     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6405     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
6406     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
6407 #if 0
6408     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
6409 #endif
6410
6411 #if 0
6412     /* This one is just used for debugging the implementation. */
6413     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
6414 #endif
6415
6416     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
6417     {NULL, NULL}
6418 };
6419
6420 static PyObject *
6421 unicode_mod(PyObject *v, PyObject *w)
6422 {
6423        if (!PyUnicode_Check(v)) {
6424                Py_INCREF(Py_NotImplemented);
6425                return Py_NotImplemented;
6426        }
6427        return PyUnicode_Format(v, w);
6428 }
6429
6430 static PyNumberMethods unicode_as_number = {
6431         0,                              /*nb_add*/
6432         0,                              /*nb_subtract*/
6433         0,                              /*nb_multiply*/
6434         0,                              /*nb_divide*/
6435         unicode_mod,                    /*nb_remainder*/
6436 };
6437
6438 static PySequenceMethods unicode_as_sequence = {
6439     (inquiry) unicode_length,           /* sq_length */
6440     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
6441     (intargfunc) unicode_repeat,        /* sq_repeat */
6442     (intargfunc) unicode_getitem,       /* sq_item */
6443     (intintargfunc) unicode_slice,      /* sq_slice */
6444     0,                                  /* sq_ass_item */
6445     0,                                  /* sq_ass_slice */
6446     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
6447 };
6448
6449 static PyObject*
6450 unicode_subscript(PyUnicodeObject* self, PyObject* item)
6451 {
6452     if (PyInt_Check(item)) {
6453         long i = PyInt_AS_LONG(item);
6454         if (i < 0)
6455             i += PyString_GET_SIZE(self);
6456         return unicode_getitem(self, i);
6457     } else if (PyLong_Check(item)) {
6458         long i = PyLong_AsLong(item);
6459         if (i == -1 && PyErr_Occurred())
6460             return NULL;
6461         if (i < 0)
6462             i += PyString_GET_SIZE(self);
6463         return unicode_getitem(self, i);
6464     } else if (PySlice_Check(item)) {
6465         int start, stop, step, slicelength, cur, i;
6466         Py_UNICODE* source_buf;
6467         Py_UNICODE* result_buf;
6468         PyObject* result;
6469
6470         if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6471                                  &start, &stop, &step, &slicelength) < 0) {
6472             return NULL;
6473         }
6474
6475         if (slicelength <= 0) {
6476             return PyUnicode_FromUnicode(NULL, 0);
6477         } else {
6478             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6479             result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6480
6481             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6482                 result_buf[i] = source_buf[cur];
6483             }
6484
6485             result = PyUnicode_FromUnicode(result_buf, slicelength);
6486             PyMem_FREE(result_buf);
6487             return result;
6488         }
6489     } else {
6490         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6491         return NULL;
6492     }
6493 }
6494
/* Mapping-protocol slot table for unicode objects.  Subscription goes
   through unicode_subscript() (ints, longs and slices); the assignment slot
   is 0, so no subscript-assignment handler is provided. */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
6500
6501 static int
6502 unicode_buffer_getreadbuf(PyUnicodeObject *self,
6503                           int index,
6504                           const void **ptr)
6505 {
6506     if (index != 0) {
6507         PyErr_SetString(PyExc_SystemError,
6508                         "accessing non-existent unicode segment");
6509         return -1;
6510     }
6511     *ptr = (void *) self->str;
6512     return PyUnicode_GET_DATA_SIZE(self);
6513 }
6514
6515 static int
6516 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6517                            const void **ptr)
6518 {
6519     PyErr_SetString(PyExc_TypeError,
6520                     "cannot use unicode as modifiable buffer");
6521     return -1;
6522 }
6523
6524 static int
6525 unicode_buffer_getsegcount(PyUnicodeObject *self,
6526                            int *lenp)
6527 {
6528     if (lenp)
6529         *lenp = PyUnicode_GET_DATA_SIZE(self);
6530     return 1;
6531 }
6532
6533 static int
6534 unicode_buffer_getcharbuf(PyUnicodeObject *self,
6535                           int index,
6536                           const void **ptr)
6537 {
6538     PyObject *str;
6539
6540     if (index != 0) {
6541         PyErr_SetString(PyExc_SystemError,
6542                         "accessing non-existent unicode segment");
6543         return -1;
6544     }
6545     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
6546     if (str == NULL)
6547         return -1;
6548     *ptr = (void *) PyString_AS_STRING(str);
6549     return PyString_GET_SIZE(str);
6550 }
6551
6552 /* Helpers for PyUnicode_Format() */
6553
6554 static PyObject *
6555 getnextarg(PyObject *args, int arglen, int *p_argidx)
6556 {
6557     int argidx = *p_argidx;
6558     if (argidx < arglen) {
6559         (*p_argidx)++;
6560         if (arglen < 0)
6561             return args;
6562         else
6563             return PyTuple_GetItem(args, argidx);
6564     }
6565     PyErr_SetString(PyExc_TypeError,
6566                     "not enough arguments for format string");
6567     return NULL;
6568 }
6569
/* Bit flags collected by PyUnicode_Format() while parsing the flag
   characters of a conversion specifier. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
6575
6576 static
6577 int usprintf(register Py_UNICODE *buffer, char *format, ...)
6578 {
6579     register int i;
6580     int len;
6581     va_list va;
6582     char *charbuffer;
6583     va_start(va, format);
6584
6585     /* First, format the string as char array, then expand to Py_UNICODE
6586        array. */
6587     charbuffer = (char *)buffer;
6588     len = vsprintf(charbuffer, format, va);
6589     for (i = len - 1; i >= 0; i--)
6590         buffer[i] = (Py_UNICODE) charbuffer[i];
6591
6592     va_end(va);
6593     return len;
6594 }
6595
6596 /* XXX To save some code duplication, formatfloat/long/int could have been
6597    shared with stringobject.c, converting from 8-bit to Unicode after the
6598    formatting is done. */
6599
6600 static int
6601 formatfloat(Py_UNICODE *buf,
6602             size_t buflen,
6603             int flags,
6604             int prec,
6605             int type,
6606             PyObject *v)
6607 {
6608     /* fmt = '%#.' + `prec` + `type`
6609        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6610     char fmt[20];
6611     double x;
6612
6613     x = PyFloat_AsDouble(v);
6614     if (x == -1.0 && PyErr_Occurred())
6615         return -1;
6616     if (prec < 0)
6617         prec = 6;
6618     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6619         type = 'g';
6620     /* Worst case length calc to ensure no buffer overrun:
6621
6622        'g' formats:
6623          fmt = %#.<prec>g
6624          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6625             for any double rep.)
6626          len = 1 + prec + 1 + 2 + 5 = 9 + prec
6627
6628        'f' formats:
6629          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6630          len = 1 + 50 + 1 + prec = 52 + prec
6631
6632        If prec=0 the effective precision is 1 (the leading digit is
6633        always given), therefore increase the length by one.
6634
6635     */
6636     if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6637         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6638         PyErr_SetString(PyExc_OverflowError,
6639                         "formatted float is too long (precision too large?)");
6640         return -1;
6641     }
6642     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6643                   (flags&F_ALT) ? "#" : "",
6644                   prec, type);
6645     return usprintf(buf, fmt, x);
6646 }
6647
6648 static PyObject*
6649 formatlong(PyObject *val, int flags, int prec, int type)
6650 {
6651         char *buf;
6652         int i, len;
6653         PyObject *str; /* temporary string object. */
6654         PyUnicodeObject *result;
6655
6656         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6657         if (!str)
6658                 return NULL;
6659         result = _PyUnicode_New(len);
6660         for (i = 0; i < len; i++)
6661                 result->str[i] = buf[i];
6662         result->str[len] = 0;
6663         Py_DECREF(str);
6664         return (PyObject*)result;
6665 }
6666
6667 static int
6668 formatint(Py_UNICODE *buf,
6669           size_t buflen,
6670           int flags,
6671           int prec,
6672           int type,
6673           PyObject *v)
6674 {
6675     /* fmt = '%#.' + `prec` + 'l' + `type`
6676      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6677      *                     + 1 + 1
6678      *                   = 24
6679      */
6680     char fmt[64]; /* plenty big enough! */
6681     char *sign;
6682     long x;
6683
6684     x = PyInt_AsLong(v);
6685     if (x == -1 && PyErr_Occurred())
6686         return -1;
6687     if (x < 0 && type == 'u') {
6688         type = 'd';
6689     }
6690     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6691         sign = "-";
6692     else
6693         sign = "";
6694     if (prec < 0)
6695         prec = 1;
6696
6697     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6698      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
6699      */
6700     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
6701         PyErr_SetString(PyExc_OverflowError,
6702                 "formatted integer is too long (precision too large?)");
6703         return -1;
6704     }
6705
6706     if ((flags & F_ALT) &&
6707         (type == 'x' || type == 'X')) {
6708         /* When converting under %#x or %#X, there are a number
6709          * of issues that cause pain:
6710          * - when 0 is being converted, the C standard leaves off
6711          *   the '0x' or '0X', which is inconsistent with other
6712          *   %#x/%#X conversions and inconsistent with Python's
6713          *   hex() function
6714          * - there are platforms that violate the standard and
6715          *   convert 0 with the '0x' or '0X'
6716          *   (Metrowerks, Compaq Tru64)
6717          * - there are platforms that give '0x' when converting
6718          *   under %#X, but convert 0 in accordance with the
6719          *   standard (OS/2 EMX)
6720          *
6721          * We can achieve the desired consistency by inserting our
6722          * own '0x' or '0X' prefix, and substituting %x/%X in place
6723          * of %#x/%#X.
6724          *
6725          * Note that this is the same approach as used in
6726          * formatint() in stringobject.c
6727          */
6728         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6729                       sign, type, prec, type);
6730     }
6731     else {
6732         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6733                       sign, (flags&F_ALT) ? "#" : "",
6734                       prec, type);
6735     }
6736     if (sign[0])
6737         return usprintf(buf, fmt, -x);
6738     else
6739         return usprintf(buf, fmt, x);
6740 }
6741
6742 static int
6743 formatchar(Py_UNICODE *buf,
6744            size_t buflen,
6745            PyObject *v)
6746 {
6747     /* presume that the buffer is at least 2 characters long */
6748     if (PyUnicode_Check(v)) {
6749         if (PyUnicode_GET_SIZE(v) != 1)
6750             goto onError;
6751         buf[0] = PyUnicode_AS_UNICODE(v)[0];
6752     }
6753
6754     else if (PyString_Check(v)) {
6755         if (PyString_GET_SIZE(v) != 1)
6756             goto onError;
6757         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6758     }
6759
6760     else {
6761         /* Integer input truncated to a character */
6762         long x;
6763         x = PyInt_AsLong(v);
6764         if (x == -1 && PyErr_Occurred())
6765             goto onError;
6766 #ifdef Py_UNICODE_WIDE
6767         if (x < 0 || x > 0x10ffff) {
6768             PyErr_SetString(PyExc_OverflowError,
6769                             "%c arg not in range(0x110000) "
6770                             "(wide Python build)");
6771             return -1;
6772         }
6773 #else
6774         if (x < 0 || x > 0xffff) {
6775             PyErr_SetString(PyExc_OverflowError,
6776                             "%c arg not in range(0x10000) "
6777                             "(narrow Python build)");
6778             return -1;
6779         }
6780 #endif
6781         buf[0] = (Py_UNICODE) x;
6782     }
6783     buf[1] = '\0';
6784     return 1;
6785
6786  onError:
6787     PyErr_SetString(PyExc_TypeError,
6788                     "%c requires int or char");
6789     return -1;
6790 }
6791
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which the floats, ints, & chars are formatted.  XXX This is a
   magic number.  The formatting routines are expected to do bounds
   checking to ensure no overflow, but a better solution may be to malloc
   a buffer of appropriate size for each format.  For now, the current
   solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
6801
6802 PyObject *PyUnicode_Format(PyObject *format,
6803                            PyObject *args)
6804 {
6805     Py_UNICODE *fmt, *res;
6806     int fmtcnt, rescnt, reslen, arglen, argidx;
6807     int args_owned = 0;
6808     PyUnicodeObject *result = NULL;
6809     PyObject *dict = NULL;
6810     PyObject *uformat;
6811
6812     if (format == NULL || args == NULL) {
6813         PyErr_BadInternalCall();
6814         return NULL;
6815     }
6816     uformat = PyUnicode_FromObject(format);
6817     if (uformat == NULL)
6818         return NULL;
6819     fmt = PyUnicode_AS_UNICODE(uformat);
6820     fmtcnt = PyUnicode_GET_SIZE(uformat);
6821
6822     reslen = rescnt = fmtcnt + 100;
6823     result = _PyUnicode_New(reslen);
6824     if (result == NULL)
6825         goto onError;
6826     res = PyUnicode_AS_UNICODE(result);
6827
6828     if (PyTuple_Check(args)) {
6829         arglen = PyTuple_Size(args);
6830         argidx = 0;
6831     }
6832     else {
6833         arglen = -1;
6834         argidx = -2;
6835     }
6836     if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6837         !PyObject_TypeCheck(args, &PyBaseString_Type))
6838         dict = args;
6839
6840     while (--fmtcnt >= 0) {
6841         if (*fmt != '%') {
6842             if (--rescnt < 0) {
6843                 rescnt = fmtcnt + 100;
6844                 reslen += rescnt;
6845                 if (_PyUnicode_Resize(&result, reslen) < 0)
6846                     return NULL;
6847                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6848                 --rescnt;
6849             }
6850             *res++ = *fmt++;
6851         }
6852         else {
6853             /* Got a format specifier */
6854             int flags = 0;
6855             int width = -1;
6856             int prec = -1;
6857             Py_UNICODE c = '\0';
6858             Py_UNICODE fill;
6859             PyObject *v = NULL;
6860             PyObject *temp = NULL;
6861             Py_UNICODE *pbuf;
6862             Py_UNICODE sign;
6863             int len;
6864             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6865
6866             fmt++;
6867             if (*fmt == '(') {
6868                 Py_UNICODE *keystart;
6869                 int keylen;
6870                 PyObject *key;
6871                 int pcount = 1;
6872
6873                 if (dict == NULL) {
6874                     PyErr_SetString(PyExc_TypeError,
6875                                     "format requires a mapping");
6876                     goto onError;
6877                 }
6878                 ++fmt;
6879                 --fmtcnt;
6880                 keystart = fmt;
6881                 /* Skip over balanced parentheses */
6882                 while (pcount > 0 && --fmtcnt >= 0) {
6883                     if (*fmt == ')')
6884                         --pcount;
6885                     else if (*fmt == '(')
6886                         ++pcount;
6887                     fmt++;
6888                 }
6889                 keylen = fmt - keystart - 1;
6890                 if (fmtcnt < 0 || pcount > 0) {
6891                     PyErr_SetString(PyExc_ValueError,
6892                                     "incomplete format key");
6893                     goto onError;
6894                 }
6895 #if 0
6896                 /* keys are converted to strings using UTF-8 and
6897                    then looked up since Python uses strings to hold
6898                    variables names etc. in its namespaces and we
6899                    wouldn't want to break common idioms. */
6900                 key = PyUnicode_EncodeUTF8(keystart,
6901                                            keylen,
6902                                            NULL);
6903 #else
6904                 key = PyUnicode_FromUnicode(keystart, keylen);
6905 #endif
6906                 if (key == NULL)
6907                     goto onError;
6908                 if (args_owned) {
6909                     Py_DECREF(args);
6910                     args_owned = 0;
6911                 }
6912                 args = PyObject_GetItem(dict, key);
6913                 Py_DECREF(key);
6914                 if (args == NULL) {
6915                     goto onError;
6916                 }
6917                 args_owned = 1;
6918                 arglen = -1;
6919                 argidx = -2;
6920             }
6921             while (--fmtcnt >= 0) {
6922                 switch (c = *fmt++) {
6923                 case '-': flags |= F_LJUST; continue;
6924                 case '+': flags |= F_SIGN; continue;
6925                 case ' ': flags |= F_BLANK; continue;
6926                 case '#': flags |= F_ALT; continue;
6927                 case '0': flags |= F_ZERO; continue;
6928                 }
6929                 break;
6930             }
6931             if (c == '*') {
6932                 v = getnextarg(args, arglen, &argidx);
6933                 if (v == NULL)
6934                     goto onError;
6935                 if (!PyInt_Check(v)) {
6936                     PyErr_SetString(PyExc_TypeError,
6937                                     "* wants int");
6938                     goto onError;
6939                 }
6940                 width = PyInt_AsLong(v);
6941                 if (width < 0) {
6942                     flags |= F_LJUST;
6943                     width = -width;
6944                 }
6945                 if (--fmtcnt >= 0)
6946                     c = *fmt++;
6947             }
6948             else if (c >= '0' && c <= '9') {
6949                 width = c - '0';
6950                 while (--fmtcnt >= 0) {
6951                     c = *fmt++;
6952                     if (c < '0' || c > '9')
6953                         break;
6954                     if ((width*10) / 10 != width) {
6955                         PyErr_SetString(PyExc_ValueError,
6956                                         "width too big");
6957                         goto onError;
6958                     }
6959                     width = width*10 + (c - '0');
6960                 }
6961             }
6962             if (c == '.') {
6963                 prec = 0;
6964                 if (--fmtcnt >= 0)
6965                     c = *fmt++;
6966                 if (c == '*') {
6967                     v = getnextarg(args, arglen, &argidx);
6968                     if (v == NULL)
6969                         goto onError;
6970                     if (!PyInt_Check(v)) {
6971                         PyErr_SetString(PyExc_TypeError,
6972                                         "* wants int");
6973                         goto onError;
6974                     }
6975                     prec = PyInt_AsLong(v);
6976                     if (prec < 0)
6977                         prec = 0;
6978                     if (--fmtcnt >= 0)
6979                         c = *fmt++;
6980                 }
6981                 else if (c >= '0' && c <= '9') {
6982                     prec = c - '0';
6983                     while (--fmtcnt >= 0) {
6984                         c = Py_CHARMASK(*fmt++);
6985                         if (c < '0' || c > '9')
6986                             break;
6987                         if ((prec*10) / 10 != prec) {
6988                             PyErr_SetString(PyExc_ValueError,
6989                                             "prec too big");
6990                             goto onError;
6991                         }
6992                         prec = prec*10 + (c - '0');
6993                     }
6994                 }
6995             } /* prec */
6996             if (fmtcnt >= 0) {
6997                 if (c == 'h' || c == 'l' || c == 'L') {
6998                     if (--fmtcnt >= 0)
6999                         c = *fmt++;
7000                 }
7001             }
7002             if (fmtcnt < 0) {
7003                 PyErr_SetString(PyExc_ValueError,
7004                                 "incomplete format");
7005                 goto onError;
7006             }
7007             if (c != '%') {
7008                 v = getnextarg(args, arglen, &argidx);
7009                 if (v == NULL)
7010                     goto onError;
7011             }
7012             sign = 0;
7013             fill = ' ';
7014             switch (c) {
7015
7016             case '%':
7017                 pbuf = formatbuf;
7018                 /* presume that buffer length is at least 1 */
7019                 pbuf[0] = '%';
7020                 len = 1;
7021                 break;
7022
7023             case 's':
7024             case 'r':
7025                 if (PyUnicode_Check(v) && c == 's') {
7026                     temp = v;
7027                     Py_INCREF(temp);
7028                 }
7029                 else {
7030                     PyObject *unicode;
7031                     if (c == 's')
7032                         temp = PyObject_Unicode(v);
7033                     else
7034                         temp = PyObject_Repr(v);
7035                     if (temp == NULL)
7036                         goto onError;
7037                     if (PyUnicode_Check(temp))
7038                         /* nothing to do */;
7039                     else if (PyString_Check(temp)) {
7040                         /* convert to string to Unicode */
7041                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7042                                                    PyString_GET_SIZE(temp),
7043                                                NULL,
7044                                                    "strict");
7045                     Py_DECREF(temp);
7046                     temp = unicode;
7047                     if (temp == NULL)
7048                         goto onError;
7049                 }
7050                     else {
7051                         Py_DECREF(temp);
7052                         PyErr_SetString(PyExc_TypeError,
7053                                         "%s argument has non-string str()");
7054                         goto onError;
7055                     }
7056                 }
7057                 pbuf = PyUnicode_AS_UNICODE(temp);
7058                 len = PyUnicode_GET_SIZE(temp);
7059                 if (prec >= 0 && len > prec)
7060                     len = prec;
7061                 break;
7062
7063             case 'i':
7064             case 'd':
7065             case 'u':
7066             case 'o':
7067             case 'x':
7068             case 'X':
7069                 if (c == 'i')
7070                     c = 'd';
7071                 if (PyLong_Check(v)) {
7072                     temp = formatlong(v, flags, prec, c);
7073                     if (!temp)
7074                         goto onError;
7075                     pbuf = PyUnicode_AS_UNICODE(temp);
7076                     len = PyUnicode_GET_SIZE(temp);
7077                     sign = 1;
7078                 }
7079                 else {
7080                     pbuf = formatbuf;
7081                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7082                                     flags, prec, c, v);
7083                     if (len < 0)
7084                         goto onError;
7085                     sign = 1;
7086                 }
7087                 if (flags & F_ZERO)
7088                     fill = '0';
7089                 break;
7090
7091             case 'e':
7092             case 'E':
7093             case 'f':
7094             case 'F':
7095             case 'g':
7096             case 'G':
7097                 if (c == 'F')
7098                         c = 'f';
7099                 pbuf = formatbuf;
7100                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7101                         flags, prec, c, v);
7102                 if (len < 0)
7103                     goto onError;
7104                 sign = 1;
7105                 if (flags & F_ZERO)
7106                     fill = '0';
7107                 break;
7108
7109             case 'c':
7110                 pbuf = formatbuf;
7111                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7112                 if (len < 0)
7113                     goto onError;
7114                 break;
7115
7116             default:
7117                 PyErr_Format(PyExc_ValueError,
7118                              "unsupported format character '%c' (0x%x) "
7119                              "at index %i",
7120                              (31<=c && c<=126) ? (char)c : '?',
7121                              (int)c,
7122                              (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7123                 goto onError;
7124             }
7125             if (sign) {
7126                 if (*pbuf == '-' || *pbuf == '+') {
7127                     sign = *pbuf++;
7128                     len--;
7129                 }
7130                 else if (flags & F_SIGN)
7131                     sign = '+';
7132                 else if (flags & F_BLANK)
7133                     sign = ' ';
7134                 else
7135                     sign = 0;
7136             }
7137             if (width < len)
7138                 width = len;
7139             if (rescnt - (sign != 0) < width) {
7140                 reslen -= rescnt;
7141                 rescnt = width + fmtcnt + 100;
7142                 reslen += rescnt;
7143                 if (reslen < 0) {
7144                     Py_DECREF(result);
7145                     return PyErr_NoMemory();
7146                 }
7147                 if (_PyUnicode_Resize(&result, reslen) < 0)
7148                     return NULL;
7149                 res = PyUnicode_AS_UNICODE(result)
7150                     + reslen - rescnt;
7151             }
7152             if (sign) {
7153                 if (fill != ' ')
7154                     *res++ = sign;
7155                 rescnt--;
7156                 if (width > len)
7157                     width--;
7158             }
7159             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7160                 assert(pbuf[0] == '0');
7161                 assert(pbuf[1] == c);
7162                 if (fill != ' ') {
7163                     *res++ = *pbuf++;
7164                     *res++ = *pbuf++;
7165                 }
7166                 rescnt -= 2;
7167                 width -= 2;
7168                 if (width < 0)
7169                     width = 0;
7170                 len -= 2;
7171             }
7172             if (width > len && !(flags & F_LJUST)) {
7173                 do {
7174                     --rescnt;
7175                     *res++ = fill;
7176                 } while (--width > len);
7177             }
7178             if (fill == ' ') {
7179                 if (sign)
7180                     *res++ = sign;
7181                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7182                     assert(pbuf[0] == '0');
7183                     assert(pbuf[1] == c);
7184                     *res++ = *pbuf++;
7185                     *res++ = *pbuf++;
7186                 }
7187             }
7188             Py_UNICODE_COPY(res, pbuf, len);
7189             res += len;
7190             rescnt -= len;
7191             while (--width >= len) {
7192                 --rescnt;
7193                 *res++ = ' ';
7194             }
7195             if (dict && (argidx < arglen) && c != '%') {
7196                 PyErr_SetString(PyExc_TypeError,
7197                                 "not all arguments converted during string formatting");
7198                 goto onError;
7199             }
7200             Py_XDECREF(temp);
7201         } /* '%' */
7202     } /* until end */
7203     if (argidx < arglen && !dict) {
7204         PyErr_SetString(PyExc_TypeError,
7205                         "not all arguments converted during string formatting");
7206         goto onError;
7207     }
7208
7209     if (args_owned) {
7210         Py_DECREF(args);
7211     }
7212     Py_DECREF(uformat);
7213     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7214         goto onError;
7215     return (PyObject *)result;
7216
7217  onError:
7218     Py_XDECREF(result);
7219     Py_DECREF(uformat);
7220     if (args_owned) {
7221         Py_DECREF(args);
7222     }
7223     return NULL;
7224 }
7225
/* Buffer-protocol slot table for unicode objects, wiring up the four
   unicode_buffer_* handlers (read buffer, write buffer, segment count and
   char buffer, in that order). */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
7232
/* Forward declaration: unicode_new() calls unicode_subtype_new() before
   the latter is defined. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7235
7236 static PyObject *
7237 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7238 {
7239         PyObject *x = NULL;
7240         static char *kwlist[] = {"string", "encoding", "errors", 0};
7241         char *encoding = NULL;
7242         char *errors = NULL;
7243
7244         if (type != &PyUnicode_Type)
7245                 return unicode_subtype_new(type, args, kwds);
7246         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7247                                           kwlist, &x, &encoding, &errors))
7248             return NULL;
7249         if (x == NULL)
7250                 return (PyObject *)_PyUnicode_New(0);
7251         if (encoding == NULL && errors == NULL)
7252             return PyObject_Unicode(x);
7253         else
7254         return PyUnicode_FromEncodedObject(x, encoding, errors);
7255 }
7256
7257 static PyObject *
7258 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7259 {
7260         PyUnicodeObject *tmp, *pnew;
7261         int n;
7262
7263         assert(PyType_IsSubtype(type, &PyUnicode_Type));
7264         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7265         if (tmp == NULL)
7266                 return NULL;
7267         assert(PyUnicode_Check(tmp));
7268         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7269         if (pnew == NULL) {
7270                 Py_DECREF(tmp);
7271                 return NULL;
7272         }
7273         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7274         if (pnew->str == NULL) {
7275                 _Py_ForgetReference((PyObject *)pnew);
7276                 PyObject_Del(pnew);
7277                 Py_DECREF(tmp);
7278                 return PyErr_NoMemory();
7279         }
7280         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7281         pnew->length = n;
7282         pnew->hash = tmp->hash;
7283         Py_DECREF(tmp);
7284         return (PyObject *)pnew;
7285 }
7286
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7293
/* The type object for unicode strings.  It ties together the protocol
   tables defined in this file (number, sequence, mapping, buffer), the
   method table, and the constructor unicode_new(); its base type is
   PyBaseString_Type and Py_TPFLAGS_BASETYPE is set in tp_flags. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,               /* tp_free */
};
7338
7339 /* Initialize the Unicode implementation */
7340
7341 void _PyUnicode_Init(void)
7342 {
7343     int i;
7344
7345     /* Init the implementation */
7346     unicode_freelist = NULL;
7347     unicode_freelist_size = 0;
7348     unicode_empty = _PyUnicode_New(0);
7349     strcpy(unicode_default_encoding, "ascii");
7350     for (i = 0; i < 256; i++)
7351         unicode_latin1[i] = NULL;
7352     if (PyType_Ready(&PyUnicode_Type) < 0)
7353         Py_FatalError("Can't initialize 'unicode'");
7354 }
7355
7356 /* Finalize the Unicode implementation */
7357
7358 void
7359 _PyUnicode_Fini(void)
7360 {
7361     PyUnicodeObject *u;
7362     int i;
7363
7364     Py_XDECREF(unicode_empty);
7365     unicode_empty = NULL;
7366
7367     for (i = 0; i < 256; i++) {
7368         if (unicode_latin1[i]) {
7369             Py_DECREF(unicode_latin1[i]);
7370             unicode_latin1[i] = NULL;
7371         }
7372     }
7373
7374     for (u = unicode_freelist; u != NULL;) {
7375         PyUnicodeObject *v = u;
7376         u = *(PyUnicodeObject **)u;
7377         if (v->str)
7378             PyMem_DEL(v->str);
7379         Py_XDECREF(v->defenc);
7380         PyObject_Del(v);
7381     }
7382     unicode_freelist = NULL;
7383     unicode_freelist_size = 0;
7384 }
7385
7386 /*
7387 Local variables:
7388 c-basic-offset: 4
7389 indent-tabs-mode: nil
7390 End:
7391 */