Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44 #include "bytes_methods.h"
  45
  46 #include "unicodeobject.h"
  47 #include "ucnhash.h"
  48
  49 #ifdef MS_WINDOWS
  50 #include <windows.h>
  51 #endif
  52
  53 /* Limit for the Unicode object free list */
  54
  55 #define PyUnicode_MAXFREELIST       1024
  56
  57 /* Limit for the Unicode object free list stay alive optimization.
  58
  59    The implementation will keep allocated Unicode memory intact for
  60    all objects on the free list having a size less than this
  61    limit. This reduces malloc() overhead for small Unicode objects.
  62
  63    At worst this will result in PyUnicode_MAXFREELIST *
  64    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  65    malloc()-overhead) bytes of unused garbage.
  66
  67    Setting the limit to 0 effectively turns the feature off.
  68
  69    Note: This is an experimental feature ! If you get core dumps when
  70    using Unicode objects, turn this feature off.
  71
  72 */
  73
  74 #define KEEPALIVE_SIZE_LIMIT       9
  75
  76 /* Endianness switches; defaults to little endian */
  77
  78 #ifdef WORDS_BIGENDIAN
  79 # define BYTEORDER_IS_BIG_ENDIAN
  80 #else
  81 # define BYTEORDER_IS_LITTLE_ENDIAN
  82 #endif
  83
  84 /* --- Globals ------------------------------------------------------------
  85
  86    The globals are initialized by the _PyUnicode_Init() API and should
  87    not be used before calling that API.
  88
  89 */
  90
  91
  92 #ifdef __cplusplus
  93 extern "C" {
  94 #endif
  95
  96 /* This dictionary holds all interned unicode strings.  Note that references
  97    to strings in this dictionary are *not* counted in the string's ob_refcnt.
  98    When the interned string reaches a refcnt of 0 the string deallocation
  99    function will delete the reference from this dictionary.
 100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 103 */
 104 static PyObject *interned;
 105
 106 /* Free list for Unicode objects */
 107 static PyUnicodeObject *free_list;
 108 static int numfree;
 109
 110 /* The empty Unicode object is shared to improve performance. */
 111 static PyUnicodeObject *unicode_empty;
 112
 113 /* Single character Unicode strings in the Latin-1 range are being
 114    shared as well. */
 115 static PyUnicodeObject *unicode_latin1[256];
 116
 117 /* Default encoding to use and assume when NULL is passed as encoding
 118    parameter; it is fixed to "utf-8".  Always use the
 119    PyUnicode_GetDefaultEncoding() API to access this global.
 120
 121    Don't forget to alter Py_FileSystemDefaultEncoding if you change the
 122    hard coded default!
 123 */
 124 static const char unicode_default_encoding[] = "utf-8";
 125
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by character code 0..127: an entry is 1 for the
   characters named in the comments below and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * HORIZONTAL TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * VERTICAL TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
 156
 157 static PyObject *unicode_encode_call_errorhandler(const char *errors,
 158        PyObject **errorHandler,const char *encoding, const char *reason,
 159        const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
 160        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
 161
 162 static void raise_encode_exception(PyObject **exceptionObject,
 163                                    const char *encoding,
 164                                    const Py_UNICODE *unicode, Py_ssize_t size,
 165                                    Py_ssize_t startpos, Py_ssize_t endpos,
 166                                    const char *reason);
 167
 168 /* Same for linebreaks */
 169 static unsigned char ascii_linebreak[] = {
 170     0, 0, 0, 0, 0, 0, 0, 0,
 171 /*         0x000A, * LINE FEED */
 172 /*         0x000D, * CARRIAGE RETURN */
 173     0, 0, 1, 0, 0, 1, 0, 0,
 174     0, 0, 0, 0, 0, 0, 0, 0,
 175 /*         0x001C, * FILE SEPARATOR */
 176 /*         0x001D, * GROUP SEPARATOR */
 177 /*         0x001E, * RECORD SEPARATOR */
 178     0, 0, 0, 0, 1, 1, 1, 0,
 179     0, 0, 0, 0, 0, 0, 0, 0,
 180     0, 0, 0, 0, 0, 0, 0, 0,
 181     0, 0, 0, 0, 0, 0, 0, 0,
 182     0, 0, 0, 0, 0, 0, 0, 0,
 183
 184     0, 0, 0, 0, 0, 0, 0, 0,
 185     0, 0, 0, 0, 0, 0, 0, 0,
 186     0, 0, 0, 0, 0, 0, 0, 0,
 187     0, 0, 0, 0, 0, 0, 0, 0,
 188     0, 0, 0, 0, 0, 0, 0, 0,
 189     0, 0, 0, 0, 0, 0, 0, 0,
 190     0, 0, 0, 0, 0, 0, 0, 0,
 191     0, 0, 0, 0, 0, 0, 0, 0
 192 };
 193
 194
 195 Py_UNICODE
 196 PyUnicode_GetMax(void)
 197 {
 198 #ifdef Py_UNICODE_WIDE
 199     return 0x10FFFF;
 200 #else
 201     /* This is actually an illegal character, so it should
 202        not be passed to unichr. */
 203     return 0xFFFF;
 204 #endif
 205 }
 206
 207 /* --- Bloom Filters ----------------------------------------------------- */
 208
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest 5
   bits of each Unicode character as the bit index. */
 212
/* the linebreak mask is set up by _PyUnicode_Init() below */
 214
 215 #define BLOOM_MASK unsigned long
 216
 217 static BLOOM_MASK bloom_linebreak;
 218
 219 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 220
/* Line-break test for ch.  Codes below 128 are looked up directly in
   ascii_linebreak[]; for any other code the bloom_linebreak mask is
   checked first and Py_UNICODE_ISLINEBREAK() is only evaluated when the
   corresponding mask bit is set.  Note that ch is evaluated more than
   once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 224
 225 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 226 {
 227     /* calculate simple bloom-style bitmask for a given unicode string */
 228
 229     long mask;
 230     Py_ssize_t i;
 231
 232     mask = 0;
 233     for (i = 0; i < len; i++)
 234         mask |= (1 << (ptr[i] & 0x1F));
 235
 236     return mask;
 237 }
 238
 239 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 240 {
 241     Py_ssize_t i;
 242
 243     for (i = 0; i < setlen; i++)
 244         if (set[i] == chr)
 245             return 1;
 246
 247     return 0;
 248 }
 249
 250 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 251     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 252
 253 /* --- Unicode Object ----------------------------------------------------- */
 254
 255 static
 256 int unicode_resize(register PyUnicodeObject *unicode,
 257                    Py_ssize_t length)
 258 {
 259     void *oldstr;
 260
 261     /* Shortcut if there's nothing much to do. */
 262     if (unicode->length == length)
 263         goto reset;
 264
 265     /* Resizing shared object (unicode_empty or single character
 266        objects) in-place is not allowed. Use PyUnicode_Resize()
 267        instead ! */
 268
 269     if (unicode == unicode_empty ||
 270         (unicode->length == 1 &&
 271          unicode->str[0] < 256U &&
 272          unicode_latin1[unicode->str[0]] == unicode)) {
 273         PyErr_SetString(PyExc_SystemError,
 274                         "can't resize shared str objects");
 275         return -1;
 276     }
 277
 278     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 279        The overallocation is also used by fastsearch, which assumes that it's
 280        safe to look at str[length] (without making any assumptions about what
 281        it contains). */
 282
 283     oldstr = unicode->str;
 284     unicode->str = PyObject_REALLOC(unicode->str,
 285                                     sizeof(Py_UNICODE) * (length + 1));
 286     if (!unicode->str) {
 287         unicode->str = (Py_UNICODE *)oldstr;
 288         PyErr_NoMemory();
 289         return -1;
 290     }
 291     unicode->str[length] = 0;
 292     unicode->length = length;
 293
 294   reset:
 295     /* Reset the object caches */
 296     if (unicode->defenc) {
 297         Py_CLEAR(unicode->defenc);
 298     }
 299     unicode->hash = -1;
 300
 301     return 0;
 302 }
 303
 304 /* We allocate one more byte to make sure the string is
 305    Ux0000 terminated; some code (e.g. new_identifier)
 306    relies on that.
 307
 308    XXX This allocator could further be enhanced by assuring that the
 309    free list never reduces its size below 1.
 310
 311 */
 312
 313 static
 314 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 315 {
 316     register PyUnicodeObject *unicode;
 317
 318     /* Optimization for empty strings */
 319     if (length == 0 && unicode_empty != NULL) {
 320         Py_INCREF(unicode_empty);
 321         return unicode_empty;
 322     }
 323
 324     /* Ensure we won't overflow the size. */
 325     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 326         return (PyUnicodeObject *)PyErr_NoMemory();
 327     }
 328
 329     /* Unicode freelist & memory allocation */
 330     if (free_list) {
 331         unicode = free_list;
 332         free_list = *(PyUnicodeObject **)unicode;
 333         numfree--;
 334         if (unicode->str) {
 335             /* Keep-Alive optimization: we only upsize the buffer,
 336                never downsize it. */
 337             if ((unicode->length < length) &&
 338                 unicode_resize(unicode, length) < 0) {
 339                 PyObject_DEL(unicode->str);
 340                 unicode->str = NULL;
 341             }
 342         }
 343         else {
 344             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 345             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 346         }
 347         PyObject_INIT(unicode, &PyUnicode_Type);
 348     }
 349     else {
 350         size_t new_size;
 351         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 352         if (unicode == NULL)
 353             return NULL;
 354         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 355         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 356     }
 357
 358     if (!unicode->str) {
 359         PyErr_NoMemory();
 360         goto onError;
 361     }
 362     /* Initialize the first element to guard against cases where
 363      * the caller fails before initializing str -- unicode_resize()
 364      * reads str[0], and the Keep-Alive optimization can keep memory
 365      * allocated for str alive across a call to unicode_dealloc(unicode).
 366      * We don't want unicode_resize to read uninitialized memory in
 367      * that case.
 368      */
 369     unicode->str[0] = 0;
 370     unicode->str[length] = 0;
 371     unicode->length = length;
 372     unicode->hash = -1;
 373     unicode->state = 0;
 374     unicode->defenc = NULL;
 375     return unicode;
 376
 377   onError:
 378     /* XXX UNREF/NEWREF interface should be more symmetrical */
 379     _Py_DEC_REFTOTAL;
 380     _Py_ForgetReference((PyObject *)unicode);
 381     PyObject_Del(unicode);
 382     return NULL;
 383 }
 384
 385 static
 386 void unicode_dealloc(register PyUnicodeObject *unicode)
 387 {
 388     switch (PyUnicode_CHECK_INTERNED(unicode)) {
 389     case SSTATE_NOT_INTERNED:
 390         break;
 391
 392     case SSTATE_INTERNED_MORTAL:
 393         /* revive dead object temporarily for DelItem */
 394         Py_REFCNT(unicode) = 3;
 395         if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
 396             Py_FatalError(
 397                 "deletion of interned string failed");
 398         break;
 399
 400     case SSTATE_INTERNED_IMMORTAL:
 401         Py_FatalError("Immortal interned string died.");
 402
 403     default:
 404         Py_FatalError("Inconsistent interned string state.");
 405     }
 406
 407     if (PyUnicode_CheckExact(unicode) &&
 408         numfree < PyUnicode_MAXFREELIST) {
 409         /* Keep-Alive optimization */
 410         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 411             PyObject_DEL(unicode->str);
 412             unicode->str = NULL;
 413             unicode->length = 0;
 414         }
 415         if (unicode->defenc) {
 416             Py_CLEAR(unicode->defenc);
 417         }
 418         /* Add to free list */
 419         *(PyUnicodeObject **)unicode = free_list;
 420         free_list = unicode;
 421         numfree++;
 422     }
 423     else {
 424         PyObject_DEL(unicode->str);
 425         Py_XDECREF(unicode->defenc);
 426         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 427     }
 428 }
 429
 430 static
 431 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 432 {
 433     register PyUnicodeObject *v;
 434
 435     /* Argument checks */
 436     if (unicode == NULL) {
 437         PyErr_BadInternalCall();
 438         return -1;
 439     }
 440     v = *unicode;
 441     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 442         PyErr_BadInternalCall();
 443         return -1;
 444     }
 445
 446     /* Resizing unicode_empty and single character objects is not
 447        possible since these are being shared. We simply return a fresh
 448        copy with the same Unicode content. */
 449     if (v->length != length &&
 450         (v == unicode_empty || v->length == 1)) {
 451         PyUnicodeObject *w = _PyUnicode_New(length);
 452         if (w == NULL)
 453             return -1;
 454         Py_UNICODE_COPY(w->str, v->str,
 455                         length < v->length ? length : v->length);
 456         Py_DECREF(*unicode);
 457         *unicode = w;
 458         return 0;
 459     }
 460
 461     /* Note that we don't have to modify *unicode for unshared Unicode
 462        objects, since we can modify them in-place. */
 463     return unicode_resize(v, length);
 464 }
 465
/* Public entry point for resizing a Unicode object: casts the PyObject **
   argument to PyUnicodeObject ** and forwards to _PyUnicode_Resize(),
   returning its result unchanged. */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
}
 470
 471 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 472                                 Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476     /* If the Unicode data is known at construction time, we can apply
 477        some optimizations which share commonly used objects. */
 478     if (u != NULL) {
 479
 480         /* Optimization for empty strings */
 481         if (size == 0 && unicode_empty != NULL) {
 482             Py_INCREF(unicode_empty);
 483             return (PyObject *)unicode_empty;
 484         }
 485
 486         /* Single character Unicode objects in the Latin-1 range are
 487            shared when using this constructor */
 488         if (size == 1 && *u < 256) {
 489             unicode = unicode_latin1[*u];
 490             if (!unicode) {
 491                 unicode = _PyUnicode_New(1);
 492                 if (!unicode)
 493                     return NULL;
 494                 unicode->str[0] = *u;
 495                 unicode_latin1[*u] = unicode;
 496             }
 497             Py_INCREF(unicode);
 498             return (PyObject *)unicode;
 499         }
 500     }
 501
 502     unicode = _PyUnicode_New(size);
 503     if (!unicode)
 504         return NULL;
 505
 506     /* Copy the Unicode data into the new object */
 507     if (u != NULL)
 508         Py_UNICODE_COPY(unicode->str, u, size);
 509
 510     return (PyObject *)unicode;
 511 }
 512
 513 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 514 {
 515     PyUnicodeObject *unicode;
 516
 517     if (size < 0) {
 518         PyErr_SetString(PyExc_SystemError,
 519                         "Negative size passed to PyUnicode_FromStringAndSize");
 520         return NULL;
 521     }
 522
 523     /* If the Unicode data is known at construction time, we can apply
 524        some optimizations which share commonly used objects.
 525        Also, this means the input must be UTF-8, so fall back to the
 526        UTF-8 decoder at the end. */
 527     if (u != NULL) {
 528
 529         /* Optimization for empty strings */
 530         if (size == 0 && unicode_empty != NULL) {
 531             Py_INCREF(unicode_empty);
 532             return (PyObject *)unicode_empty;
 533         }
 534
 535         /* Single characters are shared when using this constructor.
 536            Restrict to ASCII, since the input must be UTF-8. */
 537         if (size == 1 && Py_CHARMASK(*u) < 128) {
 538             unicode = unicode_latin1[Py_CHARMASK(*u)];
 539             if (!unicode) {
 540                 unicode = _PyUnicode_New(1);
 541                 if (!unicode)
 542                     return NULL;
 543                 unicode->str[0] = Py_CHARMASK(*u);
 544                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 545             }
 546             Py_INCREF(unicode);
 547             return (PyObject *)unicode;
 548         }
 549
 550         return PyUnicode_DecodeUTF8(u, size, NULL);
 551     }
 552
 553     unicode = _PyUnicode_New(size);
 554     if (!unicode)
 555         return NULL;
 556
 557     return (PyObject *)unicode;
 558 }
 559
 560 PyObject *PyUnicode_FromString(const char *u)
 561 {
 562     size_t size = strlen(u);
 563     if (size > PY_SSIZE_T_MAX) {
 564         PyErr_SetString(PyExc_OverflowError, "input too long");
 565         return NULL;
 566     }
 567
 568     return PyUnicode_FromStringAndSize(u, size);
 569 }
 570
 571 #ifdef HAVE_WCHAR_H
 572
 573 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 574 # define CONVERT_WCHAR_TO_SURROGATES
 575 #endif
 576
 577 #ifdef CONVERT_WCHAR_TO_SURROGATES
 578
 579 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 580    to convert from UTF32 to UTF16. */
 581
 582 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 583                                  Py_ssize_t size)
 584 {
 585     PyUnicodeObject *unicode;
 586     register Py_ssize_t i;
 587     Py_ssize_t alloc;
 588     const wchar_t *orig_w;
 589
 590     if (w == NULL) {
 591         if (size == 0)
 592             return PyUnicode_FromStringAndSize(NULL, 0);
 593         PyErr_BadInternalCall();
 594         return NULL;
 595     }
 596
 597     if (size == -1) {
 598         size = wcslen(w);
 599     }
 600
 601     alloc = size;
 602     orig_w = w;
 603     for (i = size; i > 0; i--) {
 604         if (*w > 0xFFFF)
 605             alloc++;
 606         w++;
 607     }
 608     w = orig_w;
 609     unicode = _PyUnicode_New(alloc);
 610     if (!unicode)
 611         return NULL;
 612
 613     /* Copy the wchar_t data into the new object */
 614     {
 615         register Py_UNICODE *u;
 616         u = PyUnicode_AS_UNICODE(unicode);
 617         for (i = size; i > 0; i--) {
 618             if (*w > 0xFFFF) {
 619                 wchar_t ordinal = *w++;
 620                 ordinal -= 0x10000;
 621                 *u++ = 0xD800 | (ordinal >> 10);
 622                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 623             }
 624             else
 625                 *u++ = *w++;
 626         }
 627     }
 628     return (PyObject *)unicode;
 629 }
 630
 631 #else
 632
 633 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 634                                  Py_ssize_t size)
 635 {
 636     PyUnicodeObject *unicode;
 637
 638     if (w == NULL) {
 639         if (size == 0)
 640             return PyUnicode_FromStringAndSize(NULL, 0);
 641         PyErr_BadInternalCall();
 642         return NULL;
 643     }
 644
 645     if (size == -1) {
 646         size = wcslen(w);
 647     }
 648
 649     unicode = _PyUnicode_New(size);
 650     if (!unicode)
 651         return NULL;
 652
 653     /* Copy the wchar_t data into the new object */
 654 #ifdef HAVE_USABLE_WCHAR_T
 655     memcpy(unicode->str, w, size * sizeof(wchar_t));
 656 #else
 657     {
 658         register Py_UNICODE *u;
 659         register Py_ssize_t i;
 660         u = PyUnicode_AS_UNICODE(unicode);
 661         for (i = size; i > 0; i--)
 662             *u++ = *w++;
 663     }
 664 #endif
 665
 666     return (PyObject *)unicode;
 667 }
 668
 669 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 670
 671 #undef CONVERT_WCHAR_TO_SURROGATES
 672
 673 static void
 674 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 675 {
 676     *fmt++ = '%';
 677     if (width) {
 678         if (zeropad)
 679             *fmt++ = '0';
 680         fmt += sprintf(fmt, "%d", width);
 681     }
 682     if (precision)
 683         fmt += sprintf(fmt, ".%d", precision);
 684     if (longflag)
 685         *fmt++ = 'l';
 686     else if (size_tflag) {
 687         char *f = PY_FORMAT_SIZE_T;
 688         while (*f)
 689             *fmt++ = *f++;
 690     }
 691     *fmt++ = c;
 692     *fmt = '\0';
 693 }
 694
/* Append the NUL-terminated char string `string` to the output being
   built: each char is stored through the cursor `s`, which is advanced
   past it.  The macro relies on `s` and the scratch pointer `copy`
   being declared at the expansion site.  It expands to a braced block,
   not a do { } while (0) statement. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 696
 697 PyObject *
 698 PyUnicode_FromFormatV(const char *format, va_list vargs)
 699 {
 700     va_list count;
 701     Py_ssize_t callcount = 0;
 702     PyObject **callresults = NULL;
 703     PyObject **callresult = NULL;
 704     Py_ssize_t n = 0;
 705     int width = 0;
 706     int precision = 0;
 707     int zeropad;
 708     const char* f;
 709     Py_UNICODE *s;
 710     PyObject *string;
 711     /* used by sprintf */
 712     char buffer[21];
 713     /* use abuffer instead of buffer, if we need more space
 714      * (which can happen if there's a format specifier with width). */
 715     char *abuffer = NULL;
 716     char *realbuffer;
 717     Py_ssize_t abuffersize = 0;
 718     char fmt[60]; /* should be enough for %0width.precisionld */
 719     const char *copy;
 720
 721 #ifdef VA_LIST_IS_ARRAY
 722     Py_MEMCPY(count, vargs, sizeof(va_list));
 723 #else
 724 #ifdef  __va_copy
 725     __va_copy(count, vargs);
 726 #else
 727     count = vargs;
 728 #endif
 729 #endif
 730     /* step 1: count the number of %S/%R/%A/%s format specifications
 731      * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
 732      * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
 733      * result in an array) */
 734     for (f = format; *f; f++) {
 735          if (*f == '%') {
 736              if (*(f+1)=='%')
 737                  continue;
 738              if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
 739                  ++callcount;
 740              while (ISDIGIT((unsigned)*f))
 741                  width = (width*10) + *f++ - '0';
 742              while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
 743                  ;
 744              if (*f == 's')
 745                  ++callcount;
 746          }
 747     }
 748     /* step 2: allocate memory for the results of
 749      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 750     if (callcount) {
 751         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 752         if (!callresults) {
 753             PyErr_NoMemory();
 754             return NULL;
 755         }
 756         callresult = callresults;
 757     }
 758     /* step 3: figure out how large a buffer we need */
 759     for (f = format; *f; f++) {
 760         if (*f == '%') {
 761             const char* p = f;
 762             width = 0;
 763             while (ISDIGIT((unsigned)*f))
 764                 width = (width*10) + *f++ - '0';
 765             while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
 766                 ;
 767
 768             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 769              * they don't affect the amount of space we reserve.
 770              */
 771             if ((*f == 'l' || *f == 'z') &&
 772                 (f[1] == 'd' || f[1] == 'u'))
 773                 ++f;
 774
 775             switch (*f) {
 776             case 'c':
 777                 (void)va_arg(count, int);
 778                 /* fall through... */
 779             case '%':
 780                 n++;
 781                 break;
 782             case 'd': case 'u': case 'i': case 'x':
 783                 (void) va_arg(count, int);
 784                 /* 20 bytes is enough to hold a 64-bit
 785                    integer.  Decimal takes the most space.
 786                    This isn't enough for octal.
 787                    If a width is specified we need more
 788                    (which we allocate later). */
 789                 if (width < 20)
 790                     width = 20;
 791                 n += width;
 792                 if (abuffersize < width)
 793                     abuffersize = width;
 794                 break;
 795             case 's':
 796             {
 797                 /* UTF-8 */
 798                 const char *s = va_arg(count, const char*);
 799                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 800                 if (!str)
 801                     goto fail;
 802                 n += PyUnicode_GET_SIZE(str);
 803                 /* Remember the str and switch to the next slot */
 804                 *callresult++ = str;
 805                 break;
 806             }
 807             case 'U':
 808             {
 809                 PyObject *obj = va_arg(count, PyObject *);
 810                 assert(obj && PyUnicode_Check(obj));
 811                 n += PyUnicode_GET_SIZE(obj);
 812                 break;
 813             }
 814             case 'V':
 815             {
 816                 PyObject *obj = va_arg(count, PyObject *);
 817                 const char *str = va_arg(count, const char *);
 818                 assert(obj || str);
 819                 assert(!obj || PyUnicode_Check(obj));
 820                 if (obj)
 821                     n += PyUnicode_GET_SIZE(obj);
 822                 else
 823                     n += strlen(str);
 824                 break;
 825             }
 826             case 'S':
 827             {
 828                 PyObject *obj = va_arg(count, PyObject *);
 829                 PyObject *str;
 830                 assert(obj);
 831                 str = PyObject_Str(obj);
 832                 if (!str)
 833                     goto fail;
 834                 n += PyUnicode_GET_SIZE(str);
 835                 /* Remember the str and switch to the next slot */
 836                 *callresult++ = str;
 837                 break;
 838             }
 839             case 'R':
 840             {
 841                 PyObject *obj = va_arg(count, PyObject *);
 842                 PyObject *repr;
 843                 assert(obj);
 844                 repr = PyObject_Repr(obj);
 845                 if (!repr)
 846                     goto fail;
 847                 n += PyUnicode_GET_SIZE(repr);
 848                 /* Remember the repr and switch to the next slot */
 849                 *callresult++ = repr;
 850                 break;
 851             }
 852             case 'A':
 853             {
 854                 PyObject *obj = va_arg(count, PyObject *);
 855                 PyObject *ascii;
 856                 assert(obj);
 857                 ascii = PyObject_ASCII(obj);
 858                 if (!ascii)
 859                     goto fail;
 860                 n += PyUnicode_GET_SIZE(ascii);
 861                 /* Remember the repr and switch to the next slot */
 862                 *callresult++ = ascii;
 863                 break;
 864             }
 865             case 'p':
 866                 (void) va_arg(count, int);
 867                 /* maximum 64-bit pointer representation:
 868                  * 0xffffffffffffffff
 869                  * so 19 characters is enough.
 870                  * XXX I count 18 -- what's the extra for?
 871                  */
 872                 n += 19;
 873                 break;
 874             default:
 875                 /* if we stumble upon an unknown
 876                    formatting code, copy the rest of
 877                    the format string to the output
 878                    string. (we cannot just skip the
 879                    code, since there's no way to know
 880                    what's in the argument list) */
 881                 n += strlen(p);
 882                 goto expand;
 883             }
 884         } else
 885             n++;
 886     }
 887   expand:
 888     if (abuffersize > 20) {
 889         abuffer = PyObject_Malloc(abuffersize);
 890         if (!abuffer) {
 891             PyErr_NoMemory();
 892             goto fail;
 893         }
 894         realbuffer = abuffer;
 895     }
 896     else
 897         realbuffer = buffer;
 898     /* step 4: fill the buffer */
 899     /* Since we've analyzed how much space we need for the worst case,
 900        we don't have to resize the string.
 901        There can be no errors beyond this point. */
 902     string = PyUnicode_FromUnicode(NULL, n);
 903     if (!string)
 904         goto fail;
 905
 906     s = PyUnicode_AS_UNICODE(string);
 907     callresult = callresults;
 908
 909     for (f = format; *f; f++) {
 910         if (*f == '%') {
 911             const char* p = f++;
 912             int longflag = 0;
 913             int size_tflag = 0;
 914             zeropad = (*f == '0');
 915             /* parse the width.precision part */
 916             width = 0;
 917             while (ISDIGIT((unsigned)*f))
 918                 width = (width*10) + *f++ - '0';
 919             precision = 0;
 920             if (*f == '.') {
 921                 f++;
 922                 while (ISDIGIT((unsigned)*f))
 923                     precision = (precision*10) + *f++ - '0';
 924             }
 925             /* handle the long flag, but only for %ld and %lu.
 926                others can be added when necessary. */
 927             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 928                 longflag = 1;
 929                 ++f;
 930             }
 931             /* handle the size_t flag. */
 932             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 933                 size_tflag = 1;
 934                 ++f;
 935             }
 936
 937             switch (*f) {
 938             case 'c':
 939                 *s++ = va_arg(vargs, int);
 940                 break;
 941             case 'd':
 942                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 943                 if (longflag)
 944                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 945                 else if (size_tflag)
 946                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 947                 else
 948                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 949                 appendstring(realbuffer);
 950                 break;
 951             case 'u':
 952                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 953                 if (longflag)
 954                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 955                 else if (size_tflag)
 956                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 957                 else
 958                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 959                 appendstring(realbuffer);
 960                 break;
 961             case 'i':
 962                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 963                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 964                 appendstring(realbuffer);
 965                 break;
 966             case 'x':
 967                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 968                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 969                 appendstring(realbuffer);
 970                 break;
 971             case 's':
 972             {
 973                 /* unused, since we already have the result */
 974                 (void) va_arg(vargs, char *);
 975                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 976                                 PyUnicode_GET_SIZE(*callresult));
 977                 s += PyUnicode_GET_SIZE(*callresult);
 978                 /* We're done with the unicode()/repr() => forget it */
 979                 Py_DECREF(*callresult);
 980                 /* switch to next unicode()/repr() result */
 981                 ++callresult;
 982                 break;
 983             }
 984             case 'U':
 985             {
 986                 PyObject *obj = va_arg(vargs, PyObject *);
 987                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 988                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 989                 s += size;
 990                 break;
 991             }
 992             case 'V':
 993             {
 994                 PyObject *obj = va_arg(vargs, PyObject *);
 995                 const char *str = va_arg(vargs, const char *);
 996                 if (obj) {
 997                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 998                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 999                     s += size;
1000                 } else {
1001                     appendstring(str);
1002                 }
1003                 break;
1004             }
1005             case 'S':
1006             case 'R':
1007             {
1008                 Py_UNICODE *ucopy;
1009                 Py_ssize_t usize;
1010                 Py_ssize_t upos;
1011                 /* unused, since we already have the result */
1012                 (void) va_arg(vargs, PyObject *);
1013                 ucopy = PyUnicode_AS_UNICODE(*callresult);
1014                 usize = PyUnicode_GET_SIZE(*callresult);
1015                 for (upos = 0; upos<usize;)
1016                     *s++ = ucopy[upos++];
1017                 /* We're done with the unicode()/repr() => forget it */
1018                 Py_DECREF(*callresult);
1019                 /* switch to next unicode()/repr() result */
1020                 ++callresult;
1021                 break;
1022             }
1023             case 'p':
1024                 sprintf(buffer, "%p", va_arg(vargs, void*));
1025                 /* %p is ill-defined:  ensure leading 0x. */
1026                 if (buffer[1] == 'X')
1027                     buffer[1] = 'x';
1028                 else if (buffer[1] != 'x') {
1029                     memmove(buffer+2, buffer, strlen(buffer)+1);
1030                     buffer[0] = '0';
1031                     buffer[1] = 'x';
1032                 }
1033                 appendstring(buffer);
1034                 break;
1035             case '%':
1036                 *s++ = '%';
1037                 break;
1038             default:
1039                 appendstring(p);
1040                 goto end;
1041             }
1042         } else
1043             *s++ = *f;
1044     }
1045
1046   end:
1047     if (callresults)
1048         PyObject_Free(callresults);
1049     if (abuffer)
1050         PyObject_Free(abuffer);
1051     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1052     return string;
1053   fail:
1054     if (callresults) {
1055         PyObject **callresult2 = callresults;
1056         while (callresult2 < callresult) {
1057             Py_DECREF(*callresult2);
1058             ++callresult2;
1059         }
1060         PyObject_Free(callresults);
1061     }
1062     if (abuffer)
1063         PyObject_Free(abuffer);
1064     return NULL;
1065 }
1066
1067 #undef appendstring
1068
1069 PyObject *
1070 PyUnicode_FromFormat(const char *format, ...)
1071 {
1072     PyObject* ret;
1073     va_list vargs;
1074
1075 #ifdef HAVE_STDARG_PROTOTYPES
1076     va_start(vargs, format);
1077 #else
1078     va_start(vargs);
1079 #endif
1080     ret = PyUnicode_FromFormatV(format, vargs);
1081     va_end(vargs);
1082     return ret;
1083 }
1084
1085 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1086                                 wchar_t *w,
1087                                 Py_ssize_t size)
1088 {
1089     if (unicode == NULL) {
1090         PyErr_BadInternalCall();
1091         return -1;
1092     }
1093
1094     /* If possible, try to copy the 0-termination as well */
1095     if (size > PyUnicode_GET_SIZE(unicode))
1096         size = PyUnicode_GET_SIZE(unicode) + 1;
1097
1098 #ifdef HAVE_USABLE_WCHAR_T
1099     memcpy(w, unicode->str, size * sizeof(wchar_t));
1100 #else
1101     {
1102         register Py_UNICODE *u;
1103         register Py_ssize_t i;
1104         u = PyUnicode_AS_UNICODE(unicode);
1105         for (i = size; i > 0; i--)
1106             *w++ = *u++;
1107     }
1108 #endif
1109
1110     if (size > PyUnicode_GET_SIZE(unicode))
1111         return PyUnicode_GET_SIZE(unicode);
1112     else
1113         return size;
1114 }
1115
1116 #endif
1117
1118 PyObject *PyUnicode_FromOrdinal(int ordinal)
1119 {
1120     Py_UNICODE s[2];
1121
1122     if (ordinal < 0 || ordinal > 0x10ffff) {
1123         PyErr_SetString(PyExc_ValueError,
1124                         "chr() arg not in range(0x110000)");
1125         return NULL;
1126     }
1127
1128 #ifndef Py_UNICODE_WIDE
1129     if (ordinal > 0xffff) {
1130         ordinal -= 0x10000;
1131         s[0] = 0xD800 | (ordinal >> 10);
1132         s[1] = 0xDC00 | (ordinal & 0x3FF);
1133         return PyUnicode_FromUnicode(s, 2);
1134     }
1135 #endif
1136
1137     s[0] = (Py_UNICODE)ordinal;
1138     return PyUnicode_FromUnicode(s, 1);
1139 }
1140
1141 PyObject *PyUnicode_FromObject(register PyObject *obj)
1142 {
1143     /* XXX Perhaps we should make this API an alias of
1144        PyObject_Str() instead ?! */
1145     if (PyUnicode_CheckExact(obj)) {
1146         Py_INCREF(obj);
1147         return obj;
1148     }
1149     if (PyUnicode_Check(obj)) {
1150         /* For a Unicode subtype that's not a Unicode object,
1151            return a true Unicode object with the same data. */
1152         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1153                                      PyUnicode_GET_SIZE(obj));
1154     }
1155     PyErr_Format(PyExc_TypeError,
1156                  "Can't convert '%.100s' object to str implicitly",
1157                  Py_TYPE(obj)->tp_name);
1158     return NULL;
1159 }
1160
1161 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1162                                       const char *encoding,
1163                                       const char *errors)
1164 {
1165     const char *s = NULL;
1166     Py_ssize_t len;
1167     PyObject *v;
1168
1169     if (obj == NULL) {
1170         PyErr_BadInternalCall();
1171         return NULL;
1172     }
1173
1174     if (PyUnicode_Check(obj)) {
1175         PyErr_SetString(PyExc_TypeError,
1176                         "decoding str is not supported");
1177         return NULL;
1178     }
1179
1180     /* Coerce object */
1181     if (PyBytes_Check(obj)) {
1182         s = PyBytes_AS_STRING(obj);
1183         len = PyBytes_GET_SIZE(obj);
1184     }
1185     else if (PyByteArray_Check(obj)) {
1186         s = PyByteArray_AS_STRING(obj);
1187         len = PyByteArray_GET_SIZE(obj);
1188     }
1189     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1190         /* Overwrite the error message with something more useful in
1191            case of a TypeError. */
1192         if (PyErr_ExceptionMatches(PyExc_TypeError))
1193             PyErr_Format(PyExc_TypeError,
1194                          "coercing to str: need string or buffer, "
1195                          "%.80s found",
1196                          Py_TYPE(obj)->tp_name);
1197         goto onError;
1198     }
1199
1200     /* Convert to Unicode */
1201     if (len == 0) {
1202         Py_INCREF(unicode_empty);
1203         v = (PyObject *)unicode_empty;
1204     }
1205     else
1206         v = PyUnicode_Decode(s, len, encoding, errors);
1207
1208     return v;
1209
1210   onError:
1211     return NULL;
1212 }
1213
1214 PyObject *PyUnicode_Decode(const char *s,
1215                            Py_ssize_t size,
1216                            const char *encoding,
1217                            const char *errors)
1218 {
1219     PyObject *buffer = NULL, *unicode;
1220     Py_buffer info;
1221     char lower[20];  /* Enough for any encoding name we recognize */
1222     char *l;
1223     const char *e;
1224
1225     if (encoding == NULL)
1226         encoding = PyUnicode_GetDefaultEncoding();
1227
1228     /* Convert encoding to lower case and replace '_' with '-' in order to
1229        catch e.g. UTF_8 */
1230     e = encoding;
1231     l = lower;
1232     while (*e && l < &lower[(sizeof lower) - 2]) {
1233         if (ISUPPER(*e)) {
1234             *l++ = TOLOWER(*e++);
1235         }
1236         else if (*e == '_') {
1237             *l++ = '-';
1238             e++;
1239         }
1240         else {
1241             *l++ = *e++;
1242         }
1243     }
1244     *l = '\0';
1245
1246     /* Shortcuts for common default encodings */
1247     if (strcmp(lower, "utf-8") == 0)
1248         return PyUnicode_DecodeUTF8(s, size, errors);
1249     else if ((strcmp(lower, "latin-1") == 0) ||
1250              (strcmp(lower, "iso-8859-1") == 0))
1251         return PyUnicode_DecodeLatin1(s, size, errors);
1252 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1253     else if (strcmp(lower, "mbcs") == 0)
1254         return PyUnicode_DecodeMBCS(s, size, errors);
1255 #endif
1256     else if (strcmp(lower, "ascii") == 0)
1257         return PyUnicode_DecodeASCII(s, size, errors);
1258     else if (strcmp(lower, "utf-16") == 0)
1259         return PyUnicode_DecodeUTF16(s, size, errors, 0);
1260     else if (strcmp(lower, "utf-32") == 0)
1261         return PyUnicode_DecodeUTF32(s, size, errors, 0);
1262
1263     /* Decode via the codec registry */
1264     buffer = NULL;
1265     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1266         goto onError;
1267     buffer = PyMemoryView_FromBuffer(&info);
1268     if (buffer == NULL)
1269         goto onError;
1270     unicode = PyCodec_Decode(buffer, encoding, errors);
1271     if (unicode == NULL)
1272         goto onError;
1273     if (!PyUnicode_Check(unicode)) {
1274         PyErr_Format(PyExc_TypeError,
1275                      "decoder did not return a str object (type=%.400s)",
1276                      Py_TYPE(unicode)->tp_name);
1277         Py_DECREF(unicode);
1278         goto onError;
1279     }
1280     Py_DECREF(buffer);
1281     return unicode;
1282
1283   onError:
1284     Py_XDECREF(buffer);
1285     return NULL;
1286 }
1287
1288 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1289                                     const char *encoding,
1290                                     const char *errors)
1291 {
1292     PyObject *v;
1293
1294     if (!PyUnicode_Check(unicode)) {
1295         PyErr_BadArgument();
1296         goto onError;
1297     }
1298
1299     if (encoding == NULL)
1300         encoding = PyUnicode_GetDefaultEncoding();
1301
1302     /* Decode via the codec registry */
1303     v = PyCodec_Decode(unicode, encoding, errors);
1304     if (v == NULL)
1305         goto onError;
1306     return v;
1307
1308   onError:
1309     return NULL;
1310 }
1311
1312 PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1313                                      const char *encoding,
1314                                      const char *errors)
1315 {
1316     PyObject *v;
1317
1318     if (!PyUnicode_Check(unicode)) {
1319         PyErr_BadArgument();
1320         goto onError;
1321     }
1322
1323     if (encoding == NULL)
1324         encoding = PyUnicode_GetDefaultEncoding();
1325
1326     /* Decode via the codec registry */
1327     v = PyCodec_Decode(unicode, encoding, errors);
1328     if (v == NULL)
1329         goto onError;
1330     if (!PyUnicode_Check(v)) {
1331         PyErr_Format(PyExc_TypeError,
1332                      "decoder did not return a str object (type=%.400s)",
1333                      Py_TYPE(v)->tp_name);
1334         Py_DECREF(v);
1335         goto onError;
1336     }
1337     return v;
1338
1339   onError:
1340     return NULL;
1341 }
1342
1343 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1344                            Py_ssize_t size,
1345                            const char *encoding,
1346                            const char *errors)
1347 {
1348     PyObject *v, *unicode;
1349
1350     unicode = PyUnicode_FromUnicode(s, size);
1351     if (unicode == NULL)
1352         return NULL;
1353     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1354     Py_DECREF(unicode);
1355     return v;
1356 }
1357
1358 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1359                                     const char *encoding,
1360                                     const char *errors)
1361 {
1362     PyObject *v;
1363
1364     if (!PyUnicode_Check(unicode)) {
1365         PyErr_BadArgument();
1366         goto onError;
1367     }
1368
1369     if (encoding == NULL)
1370         encoding = PyUnicode_GetDefaultEncoding();
1371
1372     /* Encode via the codec registry */
1373     v = PyCodec_Encode(unicode, encoding, errors);
1374     if (v == NULL)
1375         goto onError;
1376     return v;
1377
1378   onError:
1379     return NULL;
1380 }
1381
1382 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1383                                     const char *encoding,
1384                                     const char *errors)
1385 {
1386     PyObject *v;
1387
1388     if (!PyUnicode_Check(unicode)) {
1389         PyErr_BadArgument();
1390         return NULL;
1391     }
1392
1393     if (encoding == NULL)
1394         encoding = PyUnicode_GetDefaultEncoding();
1395
1396     /* Shortcuts for common default encodings */
1397     if (errors == NULL) {
1398         if (strcmp(encoding, "utf-8") == 0)
1399             return PyUnicode_AsUTF8String(unicode);
1400         else if (strcmp(encoding, "latin-1") == 0)
1401             return PyUnicode_AsLatin1String(unicode);
1402 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1403         else if (strcmp(encoding, "mbcs") == 0)
1404             return PyUnicode_AsMBCSString(unicode);
1405 #endif
1406         else if (strcmp(encoding, "ascii") == 0)
1407             return PyUnicode_AsASCIIString(unicode);
1408         /* During bootstrap, we may need to find the encodings
1409            package, to load the file system encoding, and require the
1410            file system encoding in order to load the encodings
1411            package.
1412
1413            Break out of this dependency by assuming that the path to
1414            the encodings module is ASCII-only.  XXX could try wcstombs
1415            instead, if the file system encoding is the locale's
1416            encoding. */
1417         else if (Py_FileSystemDefaultEncoding &&
1418                  strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1419                  !PyThreadState_GET()->interp->codecs_initialized)
1420             return PyUnicode_AsASCIIString(unicode);
1421     }
1422
1423     /* Encode via the codec registry */
1424     v = PyCodec_Encode(unicode, encoding, errors);
1425     if (v == NULL)
1426         return NULL;
1427
1428     /* The normal path */
1429     if (PyBytes_Check(v))
1430         return v;
1431
1432     /* If the codec returns a buffer, raise a warning and convert to bytes */
1433     if (PyByteArray_Check(v)) {
1434         char msg[100];
1435         PyObject *b;
1436         PyOS_snprintf(msg, sizeof(msg),
1437                       "encoder %s returned buffer instead of bytes",
1438                       encoding);
1439         if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1440             Py_DECREF(v);
1441             return NULL;
1442         }
1443
1444         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1445         Py_DECREF(v);
1446         return b;
1447     }
1448
1449     PyErr_Format(PyExc_TypeError,
1450                  "encoder did not return a bytes object (type=%.400s)",
1451                  Py_TYPE(v)->tp_name);
1452     Py_DECREF(v);
1453     return NULL;
1454 }
1455
1456 PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1457                                      const char *encoding,
1458                                      const char *errors)
1459 {
1460     PyObject *v;
1461
1462     if (!PyUnicode_Check(unicode)) {
1463         PyErr_BadArgument();
1464         goto onError;
1465     }
1466
1467     if (encoding == NULL)
1468         encoding = PyUnicode_GetDefaultEncoding();
1469
1470     /* Encode via the codec registry */
1471     v = PyCodec_Encode(unicode, encoding, errors);
1472     if (v == NULL)
1473         goto onError;
1474     if (!PyUnicode_Check(v)) {
1475         PyErr_Format(PyExc_TypeError,
1476                      "encoder did not return an str object (type=%.400s)",
1477                      Py_TYPE(v)->tp_name);
1478         Py_DECREF(v);
1479         goto onError;
1480     }
1481     return v;
1482
1483   onError:
1484     return NULL;
1485 }
1486
1487 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1488                                             const char *errors)
1489 {
1490     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1491     if (v)
1492         return v;
1493     if (errors != NULL)
1494         Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1495     v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1496                              PyUnicode_GET_SIZE(unicode),
1497                              NULL);
1498     if (!v)
1499         return NULL;
1500     ((PyUnicodeObject *)unicode)->defenc = v;
1501     return v;
1502 }
1503
1504 PyObject*
1505 PyUnicode_DecodeFSDefault(const char *s) {
1506     Py_ssize_t size = (Py_ssize_t)strlen(s);
1507     return PyUnicode_DecodeFSDefaultAndSize(s, size);
1508 }
1509
1510 PyObject*
1511 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1512 {
1513     /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1514        can be undefined. If it is case, decode using UTF-8. The following assumes
1515        that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1516        bootstrapping process where the codecs aren't ready yet.
1517     */
1518     if (Py_FileSystemDefaultEncoding) {
1519 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1520         if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1521             return PyUnicode_DecodeMBCS(s, size, "replace");
1522         }
1523 #elif defined(__APPLE__)
1524         if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1525             return PyUnicode_DecodeUTF8(s, size, "replace");
1526         }
1527 #endif
1528         return PyUnicode_Decode(s, size,
1529                                 Py_FileSystemDefaultEncoding,
1530                                 "replace");
1531     }
1532     else {
1533         return PyUnicode_DecodeUTF8(s, size, "replace");
1534     }
1535 }
1536
1537 /* Convert the argument to a bytes object, according to the file
1538    system encoding */
1539
1540 int
1541 PyUnicode_FSConverter(PyObject* arg, void* addr)
1542 {
1543     PyObject *output = NULL;
1544     Py_ssize_t size;
1545     void *data;
1546     if (arg == NULL) {
1547         Py_DECREF(*(PyObject**)addr);
1548         return 1;
1549     }
1550     if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1551         output = arg;
1552         Py_INCREF(output);
1553     }
1554     else {
1555         arg = PyUnicode_FromObject(arg);
1556         if (!arg)
1557             return 0;
1558         output = PyUnicode_AsEncodedObject(arg,
1559                                            Py_FileSystemDefaultEncoding,
1560                                            "surrogateescape");
1561         Py_DECREF(arg);
1562         if (!output)
1563             return 0;
1564         if (!PyBytes_Check(output)) {
1565             Py_DECREF(output);
1566             PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1567             return 0;
1568         }
1569     }
1570     if (PyBytes_Check(output)) {
1571          size = PyBytes_GET_SIZE(output);
1572          data = PyBytes_AS_STRING(output);
1573     }
1574     else {
1575          size = PyByteArray_GET_SIZE(output);
1576          data = PyByteArray_AS_STRING(output);
1577     }
1578     if (size != strlen(data)) {
1579         PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1580         Py_DECREF(output);
1581         return 0;
1582     }
1583     *(PyObject**)addr = output;
1584     return Py_CLEANUP_SUPPORTED;
1585 }
1586
1587
1588 char*
1589 _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1590 {
1591     PyObject *bytes;
1592     if (!PyUnicode_Check(unicode)) {
1593         PyErr_BadArgument();
1594         return NULL;
1595     }
1596     bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1597     if (bytes == NULL)
1598         return NULL;
1599     if (psize != NULL)
1600         *psize = PyBytes_GET_SIZE(bytes);
1601     return PyBytes_AS_STRING(bytes);
1602 }
1603
1604 char*
1605 _PyUnicode_AsString(PyObject *unicode)
1606 {
1607     return _PyUnicode_AsStringAndSize(unicode, NULL);
1608 }
1609
1610 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1611 {
1612     if (!PyUnicode_Check(unicode)) {
1613         PyErr_BadArgument();
1614         goto onError;
1615     }
1616     return PyUnicode_AS_UNICODE(unicode);
1617
1618   onError:
1619     return NULL;
1620 }
1621
1622 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1623 {
1624     if (!PyUnicode_Check(unicode)) {
1625         PyErr_BadArgument();
1626         goto onError;
1627     }
1628     return PyUnicode_GET_SIZE(unicode);
1629
1630   onError:
1631     return -1;
1632 }
1633
1634 const char *PyUnicode_GetDefaultEncoding(void)
1635 {
1636     return unicode_default_encoding;
1637 }
1638
1639 int PyUnicode_SetDefaultEncoding(const char *encoding)
1640 {
1641     if (strcmp(encoding, unicode_default_encoding) != 0) {
1642         PyErr_Format(PyExc_ValueError,
1643                      "Can only set default encoding to %s",
1644                      unicode_default_encoding);
1645         return -1;
1646     }
1647     return 0;
1648 }
1649
1650 /* error handling callback helper:
1651    build arguments, call the callback and check the arguments,
1652    if no exception occurred, copy the replacement to the output
1653    and adjust various state variables.
1654    return 0 on success, -1 on error
1655 */
1656
1657 static
1658 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1659                                      const char *encoding, const char *reason,
1660                                      const char **input, const char **inend, Py_ssize_t *startinpos,
1661                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1662                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1663 {
1664     static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1665
1666     PyObject *restuple = NULL;
1667     PyObject *repunicode = NULL;
1668     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1669     Py_ssize_t insize;
1670     Py_ssize_t requiredsize;
1671     Py_ssize_t newpos;
1672     Py_UNICODE *repptr;
1673     PyObject *inputobj = NULL;
1674     Py_ssize_t repsize;
1675     int res = -1;
1676
1677     if (*errorHandler == NULL) {
1678         *errorHandler = PyCodec_LookupError(errors);
1679         if (*errorHandler == NULL)
1680             goto onError;
1681     }
1682
1683     if (*exceptionObject == NULL) {
1684         *exceptionObject = PyUnicodeDecodeError_Create(
1685             encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1686         if (*exceptionObject == NULL)
1687             goto onError;
1688     }
1689     else {
1690         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1691             goto onError;
1692         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1693             goto onError;
1694         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1695             goto onError;
1696     }
1697
1698     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1699     if (restuple == NULL)
1700         goto onError;
1701     if (!PyTuple_Check(restuple)) {
1702         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1703         goto onError;
1704     }
1705     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1706         goto onError;
1707
1708     /* Copy back the bytes variables, which might have been modified by the
1709        callback */
1710     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1711     if (!inputobj)
1712         goto onError;
1713     if (!PyBytes_Check(inputobj)) {
1714         PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1715     }
1716     *input = PyBytes_AS_STRING(inputobj);
1717     insize = PyBytes_GET_SIZE(inputobj);
1718     *inend = *input + insize;
1719     /* we can DECREF safely, as the exception has another reference,
1720        so the object won't go away. */
1721     Py_DECREF(inputobj);
1722
1723     if (newpos<0)
1724         newpos = insize+newpos;
1725     if (newpos<0 || newpos>insize) {
1726         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1727         goto onError;
1728     }
1729
1730     /* need more space? (at least enough for what we
1731        have+the replacement+the rest of the string (starting
1732        at the new input position), so we won't have to check space
1733        when there are no errors in the rest of the string) */
1734     repptr = PyUnicode_AS_UNICODE(repunicode);
1735     repsize = PyUnicode_GET_SIZE(repunicode);
1736     requiredsize = *outpos + repsize + insize-newpos;
1737     if (requiredsize > outsize) {
1738         if (requiredsize<2*outsize)
1739             requiredsize = 2*outsize;
1740         if (_PyUnicode_Resize(output, requiredsize) < 0)
1741             goto onError;
1742         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1743     }
1744     *endinpos = newpos;
1745     *inptr = *input + newpos;
1746     Py_UNICODE_COPY(*outptr, repptr, repsize);
1747     *outptr += repsize;
1748     *outpos += repsize;
1749
1750     /* we made it! */
1751     res = 0;
1752
1753   onError:
1754     Py_XDECREF(restuple);
1755     return res;
1756 }
1757
1758 /* --- UTF-7 Codec -------------------------------------------------------- */
1759
1760 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1761
1762 /* Three simple macros defining base-64. */
1763
/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'?
   Note that c is evaluated more than once. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ((c) >= '0' && (c) <= '9') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= 'A' && (c) <= 'Z'))
1771
/* Given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, and
   everything else, '/' included, -> 63.)  c is evaluated more than
   once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1779
/* What is the base-64 character for the bottom 6 bits of n?  The
   alphabet is spelled as three adjacent literals (upper case, lower
   case, digits and the two symbols), which the compiler concatenates. */

#define TO_BASE64(n)  \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])
1784
/* DECODE_DIRECT: should this byte, found in a UTF-7 string, be decoded
 * as itself?  Decoding is permissive: every value up to 127 qualifies
 * except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
1792
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only
 * lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1826
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character to classify; only 1..127 can be direct
 *   directO  - nonzero if "Set O" characters (category 1) are direct
 *   directWS - nonzero if whitespace (category 2) is direct
 *
 * "Set D" characters (category 0) are always direct.  The flag
 * arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning.  utf7_category is
 * only indexed after the range test on c has succeeded. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1838
1839 PyObject *PyUnicode_DecodeUTF7(const char *s,
1840                                Py_ssize_t size,
1841                                const char *errors)
1842 {
1843     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1844 }
1845
1846 /* The decoder.  The only state we preserve is our read position,
1847  * i.e. how many characters we have consumed.  So if we end in the
1848  * middle of a shift sequence we have to back off the read position
1849  * and the output to the beginning of the sequence, otherwise we lose
1850  * all the shift state (seen bits, number of bits seen, high
1851  * surrogate). */
1852
1853 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1854                                        Py_ssize_t size,
1855                                        const char *errors,
1856                                        Py_ssize_t *consumed)
1857 {
1858     const char *starts = s;
1859     Py_ssize_t startinpos;
1860     Py_ssize_t endinpos;
1861     Py_ssize_t outpos;
1862     const char *e;
1863     PyUnicodeObject *unicode;
1864     Py_UNICODE *p;
1865     const char *errmsg = "";
1866     int inShift = 0;
1867     Py_UNICODE *shiftOutStart;
1868     unsigned int base64bits = 0;
1869     unsigned long base64buffer = 0;
1870     Py_UNICODE surrogate = 0;
1871     PyObject *errorHandler = NULL;
1872     PyObject *exc = NULL;
1873
1874     unicode = _PyUnicode_New(size);
1875     if (!unicode)
1876         return NULL;
1877     if (size == 0) {
1878         if (consumed)
1879             *consumed = 0;
1880         return (PyObject *)unicode;
1881     }
1882
1883     p = unicode->str;
1884     shiftOutStart = p;
1885     e = s + size;
1886
1887     while (s < e) {
1888         Py_UNICODE ch;
1889       restart:
1890         ch = (unsigned char) *s;
1891
1892         if (inShift) { /* in a base-64 section */
1893             if (IS_BASE64(ch)) { /* consume a base-64 character */
1894                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1895                 base64bits += 6;
1896                 s++;
1897                 if (base64bits >= 16) {
1898                     /* we have enough bits for a UTF-16 value */
1899                     Py_UNICODE outCh = (Py_UNICODE)
1900                                        (base64buffer >> (base64bits-16));
1901                     base64bits -= 16;
1902                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1903                     if (surrogate) {
1904                         /* expecting a second surrogate */
1905                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1906 #ifdef Py_UNICODE_WIDE
1907                             *p++ = (((surrogate & 0x3FF)<<10)
1908                                     | (outCh & 0x3FF)) + 0x10000;
1909 #else
1910                             *p++ = surrogate;
1911                             *p++ = outCh;
1912 #endif
1913                             surrogate = 0;
1914                         }
1915                         else {
1916                             surrogate = 0;
1917                             errmsg = "second surrogate missing";
1918                             goto utf7Error;
1919                         }
1920                     }
1921                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1922                         /* first surrogate */
1923                         surrogate = outCh;
1924                     }
1925                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1926                         errmsg = "unexpected second surrogate";
1927                         goto utf7Error;
1928                     }
1929                     else {
1930                         *p++ = outCh;
1931                     }
1932                 }
1933             }
1934             else { /* now leaving a base-64 section */
1935                 inShift = 0;
1936                 s++;
1937                 if (surrogate) {
1938                     errmsg = "second surrogate missing at end of shift sequence";
1939                     goto utf7Error;
1940                 }
1941                 if (base64bits > 0) { /* left-over bits */
1942                     if (base64bits >= 6) {
1943                         /* We've seen at least one base-64 character */
1944                         errmsg = "partial character in shift sequence";
1945                         goto utf7Error;
1946                     }
1947                     else {
1948                         /* Some bits remain; they should be zero */
1949                         if (base64buffer != 0) {
1950                             errmsg = "non-zero padding bits in shift sequence";
1951                             goto utf7Error;
1952                         }
1953                     }
1954                 }
1955                 if (ch != '-') {
1956                     /* '-' is absorbed; other terminating
1957                        characters are preserved */
1958                     *p++ = ch;
1959                 }
1960             }
1961         }
1962         else if ( ch == '+' ) {
1963             startinpos = s-starts;
1964             s++; /* consume '+' */
1965             if (s < e && *s == '-') { /* '+-' encodes '+' */
1966                 s++;
1967                 *p++ = '+';
1968             }
1969             else { /* begin base64-encoded section */
1970                 inShift = 1;
1971                 shiftOutStart = p;
1972                 base64bits = 0;
1973             }
1974         }
1975         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1976             *p++ = ch;
1977             s++;
1978         }
1979         else {
1980             startinpos = s-starts;
1981             s++;
1982             errmsg = "unexpected special character";
1983             goto utf7Error;
1984         }
1985         continue;
1986 utf7Error:
1987         outpos = p-PyUnicode_AS_UNICODE(unicode);
1988         endinpos = s-starts;
1989         if (unicode_decode_call_errorhandler(
1990                 errors, &errorHandler,
1991                 "utf7", errmsg,
1992                 &starts, &e, &startinpos, &endinpos, &exc, &s,
1993                 &unicode, &outpos, &p))
1994             goto onError;
1995     }
1996
1997     /* end of string */
1998
1999     if (inShift && !consumed) { /* in shift sequence, no more to follow */
2000         /* if we're in an inconsistent state, that's an error */
2001         if (surrogate ||
2002                 (base64bits >= 6) ||
2003                 (base64bits > 0 && base64buffer != 0)) {
2004             outpos = p-PyUnicode_AS_UNICODE(unicode);
2005             endinpos = size;
2006             if (unicode_decode_call_errorhandler(
2007                     errors, &errorHandler,
2008                     "utf7", "unterminated shift sequence",
2009                     &starts, &e, &startinpos, &endinpos, &exc, &s,
2010                     &unicode, &outpos, &p))
2011                 goto onError;
2012             if (s < e)
2013                 goto restart;
2014         }
2015     }
2016
2017     /* return state */
2018     if (consumed) {
2019         if (inShift) {
2020             p = shiftOutStart; /* back off output */
2021             *consumed = startinpos;
2022         }
2023         else {
2024             *consumed = s-starts;
2025         }
2026     }
2027
2028     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2029         goto onError;
2030
2031     Py_XDECREF(errorHandler);
2032     Py_XDECREF(exc);
2033     return (PyObject *)unicode;
2034
2035   onError:
2036     Py_XDECREF(errorHandler);
2037     Py_XDECREF(exc);
2038     Py_DECREF(unicode);
2039     return NULL;
2040 }
2041
2042
2043 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2044                                Py_ssize_t size,
2045                                int base64SetO,
2046                                int base64WhiteSpace,
2047                                const char *errors)
2048 {
2049     PyObject *v;
2050     /* It might be possible to tighten this worst case */
2051     Py_ssize_t allocated = 8 * size;
2052     int inShift = 0;
2053     Py_ssize_t i = 0;
2054     unsigned int base64bits = 0;
2055     unsigned long base64buffer = 0;
2056     char * out;
2057     char * start;
2058
2059     if (size == 0)
2060         return PyBytes_FromStringAndSize(NULL, 0);
2061
2062     if (allocated / 8 != size)
2063         return PyErr_NoMemory();
2064
2065     v = PyBytes_FromStringAndSize(NULL, allocated);
2066     if (v == NULL)
2067         return NULL;
2068
2069     start = out = PyBytes_AS_STRING(v);
2070     for (;i < size; ++i) {
2071         Py_UNICODE ch = s[i];
2072
2073         if (inShift) {
2074             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2075                 /* shifting out */
2076                 if (base64bits) { /* output remaining bits */
2077                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
2078                     base64buffer = 0;
2079                     base64bits = 0;
2080                 }
2081                 inShift = 0;
2082                 /* Characters not in the BASE64 set implicitly unshift the sequence
2083                    so no '-' is required, except if the character is itself a '-' */
2084                 if (IS_BASE64(ch) || ch == '-') {
2085                     *out++ = '-';
2086                 }
2087                 *out++ = (char) ch;
2088             }
2089             else {
2090                 goto encode_char;
2091             }
2092         }
2093         else { /* not in a shift sequence */
2094             if (ch == '+') {
2095                 *out++ = '+';
2096                         *out++ = '-';
2097             }
2098             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2099                 *out++ = (char) ch;
2100             }
2101             else {
2102                 *out++ = '+';
2103                 inShift = 1;
2104                 goto encode_char;
2105             }
2106         }
2107         continue;
2108 encode_char:
2109 #ifdef Py_UNICODE_WIDE
2110         if (ch >= 0x10000) {
2111             /* code first surrogate */
2112             base64bits += 16;
2113             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2114             while (base64bits >= 6) {
2115                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2116                 base64bits -= 6;
2117             }
2118             /* prepare second surrogate */
2119             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2120         }
2121 #endif
2122         base64bits += 16;
2123         base64buffer = (base64buffer << 16) | ch;
2124         while (base64bits >= 6) {
2125             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2126             base64bits -= 6;
2127         }
2128     }
2129     if (base64bits)
2130         *out++= TO_BASE64(base64buffer << (6-base64bits) );
2131     if (inShift)
2132         *out++ = '-';
2133     if (_PyBytes_Resize(&v, out - start) < 0)
2134         return NULL;
2135     return v;
2136 }
2137
/* Drop the UTF-7 helper macros here so they are not visible to the
   code that follows. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
2143
2144 /* --- UTF-8 Codec -------------------------------------------------------- */
2145
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 3629 for
   details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: multi-byte prefixes; C0, C1 and F5-FF are illegal */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2167
2168 PyObject *PyUnicode_DecodeUTF8(const char *s,
2169                                Py_ssize_t size,
2170                                const char *errors)
2171 {
2172     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2173 }
2174
2175 /* Mask to check or force alignment of a pointer to C 'long' boundaries */
2176 #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2177
2178 /* Mask to quickly check whether a C 'long' contains a
2179    non-ASCII, UTF8-encoded char. */
2180 #if (SIZEOF_LONG == 8)
2181 # define ASCII_CHAR_MASK 0x8080808080808080L
2182 #elif (SIZEOF_LONG == 4)
2183 # define ASCII_CHAR_MASK 0x80808080L
2184 #else
2185 # error C 'long' size should be either 4 or 8!
2186 #endif
2187
2188 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2189                                        Py_ssize_t size,
2190                                        const char *errors,
2191                                        Py_ssize_t *consumed)
2192 {
2193     const char *starts = s;
2194     int n;
2195     int k;
2196     Py_ssize_t startinpos;
2197     Py_ssize_t endinpos;
2198     Py_ssize_t outpos;
2199     const char *e, *aligned_end;
2200     PyUnicodeObject *unicode;
2201     Py_UNICODE *p;
2202     const char *errmsg = "";
2203     PyObject *errorHandler = NULL;
2204     PyObject *exc = NULL;
2205
2206     /* Note: size will always be longer than the resulting Unicode
2207        character count */
2208     unicode = _PyUnicode_New(size);
2209     if (!unicode)
2210         return NULL;
2211     if (size == 0) {
2212         if (consumed)
2213             *consumed = 0;
2214         return (PyObject *)unicode;
2215     }
2216
2217     /* Unpack UTF-8 encoded data */
2218     p = unicode->str;
2219     e = s + size;
2220     aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2221
2222     while (s < e) {
2223         Py_UCS4 ch = (unsigned char)*s;
2224
2225         if (ch < 0x80) {
2226             /* Fast path for runs of ASCII characters. Given that common UTF-8
2227                input will consist of an overwhelming majority of ASCII
2228                characters, we try to optimize for this case by checking
2229                as many characters as a C 'long' can contain.
2230                First, check if we can do an aligned read, as most CPUs have
2231                a penalty for unaligned reads.
2232             */
2233             if (!((size_t) s & LONG_PTR_MASK)) {
2234                 /* Help register allocation */
2235                 register const char *_s = s;
2236                 register Py_UNICODE *_p = p;
2237                 while (_s < aligned_end) {
2238                     /* Read a whole long at a time (either 4 or 8 bytes),
2239                        and do a fast unrolled copy if it only contains ASCII
2240                        characters. */
2241                     unsigned long data = *(unsigned long *) _s;
2242                     if (data & ASCII_CHAR_MASK)
2243                         break;
2244                     _p[0] = (unsigned char) _s[0];
2245                     _p[1] = (unsigned char) _s[1];
2246                     _p[2] = (unsigned char) _s[2];
2247                     _p[3] = (unsigned char) _s[3];
2248 #if (SIZEOF_LONG == 8)
2249                     _p[4] = (unsigned char) _s[4];
2250                     _p[5] = (unsigned char) _s[5];
2251                     _p[6] = (unsigned char) _s[6];
2252                     _p[7] = (unsigned char) _s[7];
2253 #endif
2254                     _s += SIZEOF_LONG;
2255                     _p += SIZEOF_LONG;
2256                 }
2257                 s = _s;
2258                 p = _p;
2259                 if (s == e)
2260                     break;
2261                 ch = (unsigned char)*s;
2262             }
2263         }
2264
2265         if (ch < 0x80) {
2266             *p++ = (Py_UNICODE)ch;
2267             s++;
2268             continue;
2269         }
2270
2271         n = utf8_code_length[ch];
2272
2273         if (s + n > e) {
2274             if (consumed)
2275                 break;
2276             else {
2277                 errmsg = "unexpected end of data";
2278                 startinpos = s-starts;
2279                 endinpos = startinpos+1;
2280                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2281                     endinpos++;
2282                 goto utf8Error;
2283             }
2284         }
2285
2286         switch (n) {
2287
2288         case 0:
2289             errmsg = "invalid start byte";
2290             startinpos = s-starts;
2291             endinpos = startinpos+1;
2292             goto utf8Error;
2293
2294         case 1:
2295             errmsg = "internal error";
2296             startinpos = s-starts;
2297             endinpos = startinpos+1;
2298             goto utf8Error;
2299
2300         case 2:
2301             if ((s[1] & 0xc0) != 0x80) {
2302                 errmsg = "invalid continuation byte";
2303                 startinpos = s-starts;
2304                 endinpos = startinpos + 1;
2305                 goto utf8Error;
2306             }
2307             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2308             assert ((ch > 0x007F) && (ch <= 0x07FF));
2309             *p++ = (Py_UNICODE)ch;
2310             break;
2311
2312         case 3:
2313             /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2314                will result in surrogates in range d800-dfff. Surrogates are
2315                not valid UTF-8 so they are rejected.
2316                See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2317                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2318             if ((s[1] & 0xc0) != 0x80 ||
2319                 (s[2] & 0xc0) != 0x80 ||
2320                 ((unsigned char)s[0] == 0xE0 &&
2321                  (unsigned char)s[1] < 0xA0) ||
2322                 ((unsigned char)s[0] == 0xED &&
2323                  (unsigned char)s[1] > 0x9F)) {
2324                 errmsg = "invalid continuation byte";
2325                 startinpos = s-starts;
2326                 endinpos = startinpos + 1;
2327
2328                 /* if s[1] first two bits are 1 and 0, then the invalid
2329                    continuation byte is s[2], so increment endinpos by 1,
2330                    if not, s[1] is invalid and endinpos doesn't need to
2331                    be incremented. */
2332                 if ((s[1] & 0xC0) == 0x80)
2333                     endinpos++;
2334                 goto utf8Error;
2335             }
2336             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2337             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2338             *p++ = (Py_UNICODE)ch;
2339             break;
2340
2341         case 4:
2342             if ((s[1] & 0xc0) != 0x80 ||
2343                 (s[2] & 0xc0) != 0x80 ||
2344                 (s[3] & 0xc0) != 0x80 ||
2345                 ((unsigned char)s[0] == 0xF0 &&
2346                  (unsigned char)s[1] < 0x90) ||
2347                 ((unsigned char)s[0] == 0xF4 &&
2348                  (unsigned char)s[1] > 0x8F)) {
2349                 errmsg = "invalid continuation byte";
2350                 startinpos = s-starts;
2351                 endinpos = startinpos + 1;
2352                 if ((s[1] & 0xC0) == 0x80) {
2353                     endinpos++;
2354                     if ((s[2] & 0xC0) == 0x80)
2355                         endinpos++;
2356                 }
2357                 goto utf8Error;
2358             }
2359             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2360                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2361             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2362
2363 #ifdef Py_UNICODE_WIDE
2364             *p++ = (Py_UNICODE)ch;
2365 #else
2366             /*  compute and append the two surrogates: */
2367
2368             /*  translate from 10000..10FFFF to 0..FFFF */
2369             ch -= 0x10000;
2370
2371             /*  high surrogate = top 10 bits added to D800 */
2372             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2373
2374             /*  low surrogate = bottom 10 bits added to DC00 */
2375             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2376 #endif
2377             break;
2378         }
2379         s += n;
2380         continue;
2381
2382       utf8Error:
2383         outpos = p-PyUnicode_AS_UNICODE(unicode);
2384         if (unicode_decode_call_errorhandler(
2385                 errors, &errorHandler,
2386                 "utf8", errmsg,
2387                 &starts, &e, &startinpos, &endinpos, &exc, &s,
2388                 &unicode, &outpos, &p))
2389             goto onError;
2390         aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2391     }
2392     if (consumed)
2393         *consumed = s-starts;
2394
2395     /* Adjust length */
2396     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2397         goto onError;
2398
2399     Py_XDECREF(errorHandler);
2400     Py_XDECREF(exc);
2401     return (PyObject *)unicode;
2402
2403   onError:
2404     Py_XDECREF(errorHandler);
2405     Py_XDECREF(exc);
2406     Py_DECREF(unicode);
2407     return NULL;
2408 }
2409
/* Only ASCII_CHAR_MASK is removed here; LONG_PTR_MASK stays defined. */
#undef ASCII_CHAR_MASK
2411
2412
2413 /* Allocation strategy:  if the string is short, convert into a stack buffer
2414    and allocate exactly as much space needed at the end.  Else allocate the
2415    maximum possible needed (4 result bytes per Unicode character), and return
2416    the excess memory at the end.
2417 */
2418 PyObject *
2419 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2420                      Py_ssize_t size,
2421                      const char *errors)
2422 {
2423 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2424
2425     Py_ssize_t i;                /* index into s of next input byte */
2426     PyObject *result;            /* result string object */
2427     char *p;                     /* next free byte in output buffer */
2428     Py_ssize_t nallocated;      /* number of result bytes allocated */
2429     Py_ssize_t nneeded;            /* number of result bytes needed */
2430     char stackbuf[MAX_SHORT_UNICHARS * 4];
2431     PyObject *errorHandler = NULL;
2432     PyObject *exc = NULL;
2433
2434     assert(s != NULL);
2435     assert(size >= 0);
2436
2437     if (size <= MAX_SHORT_UNICHARS) {
2438         /* Write into the stack buffer; nallocated can't overflow.
2439          * At the end, we'll allocate exactly as much heap space as it
2440          * turns out we need.
2441          */
2442         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2443         result = NULL;   /* will allocate after we're done */
2444         p = stackbuf;
2445     }
2446     else {
2447         /* Overallocate on the heap, and give the excess back at the end. */
2448         nallocated = size * 4;
2449         if (nallocated / 4 != size)  /* overflow! */
2450             return PyErr_NoMemory();
2451         result = PyBytes_FromStringAndSize(NULL, nallocated);
2452         if (result == NULL)
2453             return NULL;
2454         p = PyBytes_AS_STRING(result);
2455     }
2456
2457     for (i = 0; i < size;) {
2458         Py_UCS4 ch = s[i++];
2459
2460         if (ch < 0x80)
2461             /* Encode ASCII */
2462             *p++ = (char) ch;
2463
2464         else if (ch < 0x0800) {
2465             /* Encode Latin-1 */
2466             *p++ = (char)(0xc0 | (ch >> 6));
2467             *p++ = (char)(0x80 | (ch & 0x3f));
2468         } else if (0xD800 <= ch && ch <= 0xDFFF) {
2469 #ifndef Py_UNICODE_WIDE
2470             /* Special case: check for high and low surrogate */
2471             if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2472                 Py_UCS4 ch2 = s[i];
2473                 /* Combine the two surrogates to form a UCS4 value */
2474                 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2475                 i++;
2476
2477                 /* Encode UCS4 Unicode ordinals */
2478                 *p++ = (char)(0xf0 | (ch >> 18));
2479                 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2480                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2481                 *p++ = (char)(0x80 | (ch & 0x3f));
2482             } else {
2483 #endif
2484                 Py_ssize_t newpos;
2485                 PyObject *rep;
2486                 Py_ssize_t repsize, k;
2487                 rep = unicode_encode_call_errorhandler
2488                     (errors, &errorHandler, "utf-8", "surrogates not allowed",
2489                      s, size, &exc, i-1, i, &newpos);
2490                 if (!rep)
2491                     goto error;
2492
2493                 if (PyBytes_Check(rep))
2494                     repsize = PyBytes_GET_SIZE(rep);
2495                 else
2496                     repsize = PyUnicode_GET_SIZE(rep);
2497
2498                 if (repsize > 4) {
2499                     Py_ssize_t offset;
2500
2501                     if (result == NULL)
2502                         offset = p - stackbuf;
2503                     else
2504                         offset = p - PyBytes_AS_STRING(result);
2505
2506                     if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2507                         /* integer overflow */
2508                         PyErr_NoMemory();
2509                         goto error;
2510                     }
2511                     nallocated += repsize - 4;
2512                     if (result != NULL) {
2513                         if (_PyBytes_Resize(&result, nallocated) < 0)
2514                             goto error;
2515                     } else {
2516                         result = PyBytes_FromStringAndSize(NULL, nallocated);
2517                         if (result == NULL)
2518                             goto error;
2519                         Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2520                     }
2521                     p = PyBytes_AS_STRING(result) + offset;
2522                 }
2523
2524                 if (PyBytes_Check(rep)) {
2525                     char *prep = PyBytes_AS_STRING(rep);
2526                     for(k = repsize; k > 0; k--)
2527                         *p++ = *prep++;
2528                 } else /* rep is unicode */ {
2529                     Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2530                     Py_UNICODE c;
2531
2532                     for(k=0; k<repsize; k++) {
2533                         c = prep[k];
2534                         if (0x80 <= c) {
2535                             raise_encode_exception(&exc, "utf-8", s, size,
2536                                                    i-1, i, "surrogates not allowed");
2537                             goto error;
2538                         }
2539                         *p++ = (char)prep[k];
2540                     }
2541                 }
2542                 Py_DECREF(rep);
2543 #ifndef Py_UNICODE_WIDE
2544             }
2545 #endif
2546         } else if (ch < 0x10000) {
2547             *p++ = (char)(0xe0 | (ch >> 12));
2548             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2549             *p++ = (char)(0x80 | (ch & 0x3f));
2550         } else /* ch >= 0x10000 */ {
2551             /* Encode UCS4 Unicode ordinals */
2552             *p++ = (char)(0xf0 | (ch >> 18));
2553             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2554             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2555             *p++ = (char)(0x80 | (ch & 0x3f));
2556         }
2557     }
2558
2559     if (result == NULL) {
2560         /* This was stack allocated. */
2561         nneeded = p - stackbuf;
2562         assert(nneeded <= nallocated);
2563         result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2564     }
2565     else {
2566         /* Cut back to size actually needed. */
2567         nneeded = p - PyBytes_AS_STRING(result);
2568         assert(nneeded <= nallocated);
2569         _PyBytes_Resize(&result, nneeded);
2570     }
2571     Py_XDECREF(errorHandler);
2572     Py_XDECREF(exc);
2573     return result;
2574  error:
2575     Py_XDECREF(errorHandler);
2576     Py_XDECREF(exc);
2577     Py_XDECREF(result);
2578     return NULL;
2579
2580 #undef MAX_SHORT_UNICHARS
2581 }
2582
2583 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2584 {
2585     if (!PyUnicode_Check(unicode)) {
2586         PyErr_BadArgument();
2587         return NULL;
2588     }
2589     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2590                                 PyUnicode_GET_SIZE(unicode),
2591                                 NULL);
2592 }
2593
2594 /* --- UTF-32 Codec ------------------------------------------------------- */
2595
2596 PyObject *
2597 PyUnicode_DecodeUTF32(const char *s,
2598                       Py_ssize_t size,
2599                       const char *errors,
2600                       int *byteorder)
2601 {
2602     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2603 }
2604
2605 PyObject *
2606 PyUnicode_DecodeUTF32Stateful(const char *s,
2607                               Py_ssize_t size,
2608                               const char *errors,
2609                               int *byteorder,
2610                               Py_ssize_t *consumed)
2611 {
2612     const char *starts = s;
2613     Py_ssize_t startinpos;
2614     Py_ssize_t endinpos;
2615     Py_ssize_t outpos;
2616     PyUnicodeObject *unicode;
2617     Py_UNICODE *p;
2618 #ifndef Py_UNICODE_WIDE
2619     int pairs = 0;
2620 #else
2621     const int pairs = 0;
2622 #endif
2623     const unsigned char *q, *e, *qq;
2624     int bo = 0;       /* assume native ordering by default */
2625     const char *errmsg = "";
2626     /* Offsets from q for retrieving bytes in the right order. */
2627 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2628     int iorder[] = {0, 1, 2, 3};
2629 #else
2630     int iorder[] = {3, 2, 1, 0};
2631 #endif
2632     PyObject *errorHandler = NULL;
2633     PyObject *exc = NULL;
2634
2635     q = (unsigned char *)s;
2636     e = q + size;
2637
2638     if (byteorder)
2639         bo = *byteorder;
2640
2641     /* Check for BOM marks (U+FEFF) in the input and adjust current
2642        byte order setting accordingly. In native mode, the leading BOM
2643        mark is skipped, in all other modes, it is copied to the output
2644        stream as-is (giving a ZWNBSP character). */
2645     if (bo == 0) {
2646         if (size >= 4) {
2647             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2648                 (q[iorder[1]] << 8) | q[iorder[0]];
2649 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2650             if (bom == 0x0000FEFF) {
2651                 q += 4;
2652                 bo = -1;
2653             }
2654             else if (bom == 0xFFFE0000) {
2655                 q += 4;
2656                 bo = 1;
2657             }
2658 #else
2659             if (bom == 0x0000FEFF) {
2660                 q += 4;
2661                 bo = 1;
2662             }
2663             else if (bom == 0xFFFE0000) {
2664                 q += 4;
2665                 bo = -1;
2666             }
2667 #endif
2668         }
2669     }
2670
2671     if (bo == -1) {
2672         /* force LE */
2673         iorder[0] = 0;
2674         iorder[1] = 1;
2675         iorder[2] = 2;
2676         iorder[3] = 3;
2677     }
2678     else if (bo == 1) {
2679         /* force BE */
2680         iorder[0] = 3;
2681         iorder[1] = 2;
2682         iorder[2] = 1;
2683         iorder[3] = 0;
2684     }
2685
2686     /* On narrow builds we split characters outside the BMP into two
2687        codepoints => count how much extra space we need. */
2688 #ifndef Py_UNICODE_WIDE
2689     for (qq = q; qq < e; qq += 4)
2690         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2691             pairs++;
2692 #endif
2693
2694     /* This might be one to much, because of a BOM */
2695     unicode = _PyUnicode_New((size+3)/4+pairs);
2696     if (!unicode)
2697         return NULL;
2698     if (size == 0)
2699         return (PyObject *)unicode;
2700
2701     /* Unpack UTF-32 encoded data */
2702     p = unicode->str;
2703
2704     while (q < e) {
2705         Py_UCS4 ch;
2706         /* remaining bytes at the end? (size should be divisible by 4) */
2707         if (e-q<4) {
2708             if (consumed)
2709                 break;
2710             errmsg = "truncated data";
2711             startinpos = ((const char *)q)-starts;
2712             endinpos = ((const char *)e)-starts;
2713             goto utf32Error;
2714             /* The remaining input chars are ignored if the callback
2715                chooses to skip the input */
2716         }
2717         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2718             (q[iorder[1]] << 8) | q[iorder[0]];
2719
2720         if (ch >= 0x110000)
2721         {
2722             errmsg = "codepoint not in range(0x110000)";
2723             startinpos = ((const char *)q)-starts;
2724             endinpos = startinpos+4;
2725             goto utf32Error;
2726         }
2727 #ifndef Py_UNICODE_WIDE
2728         if (ch >= 0x10000)
2729         {
2730             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2731             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2732         }
2733         else
2734 #endif
2735             *p++ = ch;
2736         q += 4;
2737         continue;
2738       utf32Error:
2739         outpos = p-PyUnicode_AS_UNICODE(unicode);
2740         if (unicode_decode_call_errorhandler(
2741                 errors, &errorHandler,
2742                 "utf32", errmsg,
2743                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2744                 &unicode, &outpos, &p))
2745             goto onError;
2746     }
2747
2748     if (byteorder)
2749         *byteorder = bo;
2750
2751     if (consumed)
2752         *consumed = (const char *)q-starts;
2753
2754     /* Adjust length */
2755     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2756         goto onError;
2757
2758     Py_XDECREF(errorHandler);
2759     Py_XDECREF(exc);
2760     return (PyObject *)unicode;
2761
2762   onError:
2763     Py_DECREF(unicode);
2764     Py_XDECREF(errorHandler);
2765     Py_XDECREF(exc);
2766     return NULL;
2767 }
2768
2769 PyObject *
2770 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2771                       Py_ssize_t size,
2772                       const char *errors,
2773                       int byteorder)
2774 {
2775     PyObject *v;
2776     unsigned char *p;
2777     Py_ssize_t nsize, bytesize;
2778 #ifndef Py_UNICODE_WIDE
2779     Py_ssize_t i, pairs;
2780 #else
2781     const int pairs = 0;
2782 #endif
2783     /* Offsets from p for storing byte pairs in the right order. */
2784 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2785     int iorder[] = {0, 1, 2, 3};
2786 #else
2787     int iorder[] = {3, 2, 1, 0};
2788 #endif
2789
2790 #define STORECHAR(CH)                           \
2791     do {                                        \
2792         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2793         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2794         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2795         p[iorder[0]] = (CH) & 0xff;             \
2796         p += 4;                                 \
2797     } while(0)
2798
2799     /* In narrow builds we can output surrogate pairs as one codepoint,
2800        so we need less space. */
2801 #ifndef Py_UNICODE_WIDE
2802     for (i = pairs = 0; i < size-1; i++)
2803         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2804             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2805             pairs++;
2806 #endif
2807     nsize = (size - pairs + (byteorder == 0));
2808     bytesize = nsize * 4;
2809     if (bytesize / 4 != nsize)
2810         return PyErr_NoMemory();
2811     v = PyBytes_FromStringAndSize(NULL, bytesize);
2812     if (v == NULL)
2813         return NULL;
2814
2815     p = (unsigned char *)PyBytes_AS_STRING(v);
2816     if (byteorder == 0)
2817         STORECHAR(0xFEFF);
2818     if (size == 0)
2819         goto done;
2820
2821     if (byteorder == -1) {
2822         /* force LE */
2823         iorder[0] = 0;
2824         iorder[1] = 1;
2825         iorder[2] = 2;
2826         iorder[3] = 3;
2827     }
2828     else if (byteorder == 1) {
2829         /* force BE */
2830         iorder[0] = 3;
2831         iorder[1] = 2;
2832         iorder[2] = 1;
2833         iorder[3] = 0;
2834     }
2835
2836     while (size-- > 0) {
2837         Py_UCS4 ch = *s++;
2838 #ifndef Py_UNICODE_WIDE
2839         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2840             Py_UCS4 ch2 = *s;
2841             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2842                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2843                 s++;
2844                 size--;
2845             }
2846         }
2847 #endif
2848         STORECHAR(ch);
2849     }
2850
2851   done:
2852     return v;
2853 #undef STORECHAR
2854 }
2855
2856 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2857 {
2858     if (!PyUnicode_Check(unicode)) {
2859         PyErr_BadArgument();
2860         return NULL;
2861     }
2862     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2863                                  PyUnicode_GET_SIZE(unicode),
2864                                  NULL,
2865                                  0);
2866 }
2867
2868 /* --- UTF-16 Codec ------------------------------------------------------- */
2869
2870 PyObject *
2871 PyUnicode_DecodeUTF16(const char *s,
2872                       Py_ssize_t size,
2873                       const char *errors,
2874                       int *byteorder)
2875 {
2876     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2877 }
2878
2879 /* Two masks for fast checking of whether a C 'long' may contain
2880    UTF16-encoded surrogate characters. This is an efficient heuristic,
2881    assuming that non-surrogate characters with a code point >= 0x8000 are
2882    rare in most input.
2883    FAST_CHAR_MASK is used when the input is in native byte ordering,
2884    SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
2885 */
2886 #if (SIZEOF_LONG == 8)
2887 # define FAST_CHAR_MASK         0x8000800080008000L
2888 # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2889 #elif (SIZEOF_LONG == 4)
2890 # define FAST_CHAR_MASK         0x80008000L
2891 # define SWAPPED_FAST_CHAR_MASK 0x00800080L
2892 #else
2893 # error C 'long' size should be either 4 or 8!
2894 #endif
2895
2896 PyObject *
2897 PyUnicode_DecodeUTF16Stateful(const char *s,
2898                               Py_ssize_t size,
2899                               const char *errors,
2900                               int *byteorder,
2901                               Py_ssize_t *consumed)
2902 {
2903     const char *starts = s;
2904     Py_ssize_t startinpos;
2905     Py_ssize_t endinpos;
2906     Py_ssize_t outpos;
2907     PyUnicodeObject *unicode;
2908     Py_UNICODE *p;
2909     const unsigned char *q, *e, *aligned_end;
2910     int bo = 0;       /* assume native ordering by default */
2911     int native_ordering = 0;
2912     const char *errmsg = "";
2913     /* Offsets from q for retrieving byte pairs in the right order. */
2914 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2915     int ihi = 1, ilo = 0;
2916 #else
2917     int ihi = 0, ilo = 1;
2918 #endif
2919     PyObject *errorHandler = NULL;
2920     PyObject *exc = NULL;
2921
2922     /* Note: size will always be longer than the resulting Unicode
2923        character count */
2924     unicode = _PyUnicode_New(size);
2925     if (!unicode)
2926         return NULL;
2927     if (size == 0)
2928         return (PyObject *)unicode;
2929
2930     /* Unpack UTF-16 encoded data */
2931     p = unicode->str;
2932     q = (unsigned char *)s;
2933     e = q + size - 1;
2934
2935     if (byteorder)
2936         bo = *byteorder;
2937
2938     /* Check for BOM marks (U+FEFF) in the input and adjust current
2939        byte order setting accordingly. In native mode, the leading BOM
2940        mark is skipped, in all other modes, it is copied to the output
2941        stream as-is (giving a ZWNBSP character). */
2942     if (bo == 0) {
2943         if (size >= 2) {
2944             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2945 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2946             if (bom == 0xFEFF) {
2947                 q += 2;
2948                 bo = -1;
2949             }
2950             else if (bom == 0xFFFE) {
2951                 q += 2;
2952                 bo = 1;
2953             }
2954 #else
2955             if (bom == 0xFEFF) {
2956                 q += 2;
2957                 bo = 1;
2958             }
2959             else if (bom == 0xFFFE) {
2960                 q += 2;
2961                 bo = -1;
2962             }
2963 #endif
2964         }
2965     }
2966
2967     if (bo == -1) {
2968         /* force LE */
2969         ihi = 1;
2970         ilo = 0;
2971     }
2972     else if (bo == 1) {
2973         /* force BE */
2974         ihi = 0;
2975         ilo = 1;
2976     }
2977 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2978     native_ordering = ilo < ihi;
2979 #else
2980     native_ordering = ilo > ihi;
2981 #endif
2982
2983     aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
2984     while (q < e) {
2985         Py_UNICODE ch;
2986         /* First check for possible aligned read of a C 'long'. Unaligned
2987            reads are more expensive, better to defer to another iteration. */
2988         if (!((size_t) q & LONG_PTR_MASK)) {
2989             /* Fast path for runs of non-surrogate chars. */
2990             register const unsigned char *_q = q;
2991             Py_UNICODE *_p = p;
2992             if (native_ordering) {
2993                 /* Native ordering is simple: as long as the input cannot
2994                    possibly contain a surrogate char, do an unrolled copy
2995                    of several 16-bit code points to the target object.
2996                    The non-surrogate check is done on several input bytes
2997                    at a time (as many as a C 'long' can contain). */
2998                 while (_q < aligned_end) {
2999                     unsigned long data = * (unsigned long *) _q;
3000                     if (data & FAST_CHAR_MASK)
3001                         break;
3002                     _p[0] = ((unsigned short *) _q)[0];
3003                     _p[1] = ((unsigned short *) _q)[1];
3004 #if (SIZEOF_LONG == 8)
3005                     _p[2] = ((unsigned short *) _q)[2];
3006                     _p[3] = ((unsigned short *) _q)[3];
3007 #endif
3008                     _q += SIZEOF_LONG;
3009                     _p += SIZEOF_LONG / 2;
3010                 }
3011             }
3012             else {
3013                 /* Byteswapped ordering is similar, but we must decompose
3014                    the copy bytewise, and take care of zero'ing out the
3015                    upper bytes if the target object is in 32-bit units
3016                    (that is, in UCS-4 builds). */
3017                 while (_q < aligned_end) {
3018                     unsigned long data = * (unsigned long *) _q;
3019                     if (data & SWAPPED_FAST_CHAR_MASK)
3020                         break;
3021                     /* Zero upper bytes in UCS-4 builds */
3022 #if (Py_UNICODE_SIZE > 2)
3023                     _p[0] = 0;
3024                     _p[1] = 0;
3025 #if (SIZEOF_LONG == 8)
3026                     _p[2] = 0;
3027                     _p[3] = 0;
3028 #endif
3029 #endif
3030                     /* Issue #4916; UCS-4 builds on big endian machines must
3031                        fill the two last bytes of each 4-byte unit. */
3032 #if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3033 # define OFF 2
3034 #else
3035 # define OFF 0
3036 #endif
3037                     ((unsigned char *) _p)[OFF + 1] = _q[0];
3038                     ((unsigned char *) _p)[OFF + 0] = _q[1];
3039                     ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3040                     ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3041 #if (SIZEOF_LONG == 8)
3042                     ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3043                     ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3044                     ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3045                     ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3046 #endif
3047 #undef OFF
3048                     _q += SIZEOF_LONG;
3049                     _p += SIZEOF_LONG / 2;
3050                 }
3051             }
3052             p = _p;
3053             q = _q;
3054             if (q >= e)
3055                 break;
3056         }
3057         ch = (q[ihi] << 8) | q[ilo];
3058
3059         q += 2;
3060
3061         if (ch < 0xD800 || ch > 0xDFFF) {
3062             *p++ = ch;
3063             continue;
3064         }
3065
3066         /* UTF-16 code pair: */
3067         if (q > e) {
3068             errmsg = "unexpected end of data";
3069             startinpos = (((const char *)q) - 2) - starts;
3070             endinpos = ((const char *)e) + 1 - starts;
3071             goto utf16Error;
3072         }
3073         if (0xD800 <= ch && ch <= 0xDBFF) {
3074             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3075             q += 2;
3076             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3077 #ifndef Py_UNICODE_WIDE
3078                 *p++ = ch;
3079                 *p++ = ch2;
3080 #else
3081                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3082 #endif
3083                 continue;
3084             }
3085             else {
3086                 errmsg = "illegal UTF-16 surrogate";
3087                 startinpos = (((const char *)q)-4)-starts;
3088                 endinpos = startinpos+2;
3089                 goto utf16Error;
3090             }
3091
3092         }
3093         errmsg = "illegal encoding";
3094         startinpos = (((const char *)q)-2)-starts;
3095         endinpos = startinpos+2;
3096         /* Fall through to report the error */
3097
3098       utf16Error:
3099         outpos = p - PyUnicode_AS_UNICODE(unicode);
3100         if (unicode_decode_call_errorhandler(
3101                 errors,
3102                 &errorHandler,
3103                 "utf16", errmsg,
3104                 &starts,
3105                 (const char **)&e,
3106                 &startinpos,
3107                 &endinpos,
3108                 &exc,
3109                 (const char **)&q,
3110                 &unicode,
3111                 &outpos,
3112                 &p))
3113             goto onError;
3114     }
3115     /* remaining byte at the end? (size should be even) */
3116     if (e == q) {
3117         if (!consumed) {
3118             errmsg = "truncated data";
3119             startinpos = ((const char *)q) - starts;
3120             endinpos = ((const char *)e) + 1 - starts;
3121             outpos = p - PyUnicode_AS_UNICODE(unicode);
3122             if (unicode_decode_call_errorhandler(
3123                     errors,
3124                     &errorHandler,
3125                     "utf16", errmsg,
3126                     &starts,
3127                     (const char **)&e,
3128                     &startinpos,
3129                     &endinpos,
3130                     &exc,
3131                     (const char **)&q,
3132                     &unicode,
3133                     &outpos,
3134                     &p))
3135                 goto onError;
3136             /* The remaining input chars are ignored if the callback
3137                chooses to skip the input */
3138         }
3139     }
3140
3141     if (byteorder)
3142         *byteorder = bo;
3143
3144     if (consumed)
3145         *consumed = (const char *)q-starts;
3146
3147     /* Adjust length */
3148     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3149         goto onError;
3150
3151     Py_XDECREF(errorHandler);
3152     Py_XDECREF(exc);
3153     return (PyObject *)unicode;
3154
3155   onError:
3156     Py_DECREF(unicode);
3157     Py_XDECREF(errorHandler);
3158     Py_XDECREF(exc);
3159     return NULL;
3160 }
3161
3162 #undef FAST_CHAR_MASK
3163 #undef SWAPPED_FAST_CHAR_MASK
3164
3165 PyObject *
3166 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3167                       Py_ssize_t size,
3168                       const char *errors,
3169                       int byteorder)
3170 {
3171     PyObject *v;
3172     unsigned char *p;
3173     Py_ssize_t nsize, bytesize;
3174 #ifdef Py_UNICODE_WIDE
3175     Py_ssize_t i, pairs;
3176 #else
3177     const int pairs = 0;
3178 #endif
3179     /* Offsets from p for storing byte pairs in the right order. */
3180 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
3181     int ihi = 1, ilo = 0;
3182 #else
3183     int ihi = 0, ilo = 1;
3184 #endif
3185
3186 #define STORECHAR(CH)                           \
3187     do {                                        \
3188         p[ihi] = ((CH) >> 8) & 0xff;            \
3189         p[ilo] = (CH) & 0xff;                   \
3190         p += 2;                                 \
3191     } while(0)
3192
3193 #ifdef Py_UNICODE_WIDE
3194     for (i = pairs = 0; i < size; i++)
3195         if (s[i] >= 0x10000)
3196             pairs++;
3197 #endif
3198     /* 2 * (size + pairs + (byteorder == 0)) */
3199     if (size > PY_SSIZE_T_MAX ||
3200         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3201         return PyErr_NoMemory();
3202     nsize = size + pairs + (byteorder == 0);
3203     bytesize = nsize * 2;
3204     if (bytesize / 2 != nsize)
3205         return PyErr_NoMemory();
3206     v = PyBytes_FromStringAndSize(NULL, bytesize);
3207     if (v == NULL)
3208         return NULL;
3209
3210     p = (unsigned char *)PyBytes_AS_STRING(v);
3211     if (byteorder == 0)
3212         STORECHAR(0xFEFF);
3213     if (size == 0)
3214         goto done;
3215
3216     if (byteorder == -1) {
3217         /* force LE */
3218         ihi = 1;
3219         ilo = 0;
3220     }
3221     else if (byteorder == 1) {
3222         /* force BE */
3223         ihi = 0;
3224         ilo = 1;
3225     }
3226
3227     while (size-- > 0) {
3228         Py_UNICODE ch = *s++;
3229         Py_UNICODE ch2 = 0;
3230 #ifdef Py_UNICODE_WIDE
3231         if (ch >= 0x10000) {
3232             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3233             ch  = 0xD800 | ((ch-0x10000) >> 10);
3234         }
3235 #endif
3236         STORECHAR(ch);
3237         if (ch2)
3238             STORECHAR(ch2);
3239     }
3240
3241   done:
3242     return v;
3243 #undef STORECHAR
3244 }
3245
3246 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3247 {
3248     if (!PyUnicode_Check(unicode)) {
3249         PyErr_BadArgument();
3250         return NULL;
3251     }
3252     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3253                                  PyUnicode_GET_SIZE(unicode),
3254                                  NULL,
3255                                  0);
3256 }
3257
3258 /* --- Unicode Escape Codec ----------------------------------------------- */
3259
3260 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3261
3262 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3263                                         Py_ssize_t size,
3264                                         const char *errors)
3265 {
3266     const char *starts = s;
3267     Py_ssize_t startinpos;
3268     Py_ssize_t endinpos;
3269     Py_ssize_t outpos;
3270     int i;
3271     PyUnicodeObject *v;
3272     Py_UNICODE *p;
3273     const char *end;
3274     char* message;
3275     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3276     PyObject *errorHandler = NULL;
3277     PyObject *exc = NULL;
3278
3279     /* Escaped strings will always be longer than the resulting
3280        Unicode string, so we start with size here and then reduce the
3281        length after conversion to the true value.
3282        (but if the error callback returns a long replacement string
3283        we'll have to allocate more space) */
3284     v = _PyUnicode_New(size);
3285     if (v == NULL)
3286         goto onError;
3287     if (size == 0)
3288         return (PyObject *)v;
3289
3290     p = PyUnicode_AS_UNICODE(v);
3291     end = s + size;
3292
3293     while (s < end) {
3294         unsigned char c;
3295         Py_UNICODE x;
3296         int digits;
3297
3298         /* Non-escape characters are interpreted as Unicode ordinals */
3299         if (*s != '\\') {
3300             *p++ = (unsigned char) *s++;
3301             continue;
3302         }
3303
3304         startinpos = s-starts;
3305         /* \ - Escapes */
3306         s++;
3307         c = *s++;
3308         if (s > end)
3309             c = '\0'; /* Invalid after \ */
3310         switch (c) {
3311
3312             /* \x escapes */
3313         case '\n': break;
3314         case '\\': *p++ = '\\'; break;
3315         case '\'': *p++ = '\''; break;
3316         case '\"': *p++ = '\"'; break;
3317         case 'b': *p++ = '\b'; break;
3318         case 'f': *p++ = '\014'; break; /* FF */
3319         case 't': *p++ = '\t'; break;
3320         case 'n': *p++ = '\n'; break;
3321         case 'r': *p++ = '\r'; break;
3322         case 'v': *p++ = '\013'; break; /* VT */
3323         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3324
3325             /* \OOO (octal) escapes */
3326         case '0': case '1': case '2': case '3':
3327         case '4': case '5': case '6': case '7':
3328             x = s[-1] - '0';
3329             if (s < end && '0' <= *s && *s <= '7') {
3330                 x = (x<<3) + *s++ - '0';
3331                 if (s < end && '0' <= *s && *s <= '7')
3332                     x = (x<<3) + *s++ - '0';
3333             }
3334             *p++ = x;
3335             break;
3336
3337             /* hex escapes */
3338             /* \xXX */
3339         case 'x':
3340             digits = 2;
3341             message = "truncated \\xXX escape";
3342             goto hexescape;
3343
3344             /* \uXXXX */
3345         case 'u':
3346             digits = 4;
3347             message = "truncated \\uXXXX escape";
3348             goto hexescape;
3349
3350             /* \UXXXXXXXX */
3351         case 'U':
3352             digits = 8;
3353             message = "truncated \\UXXXXXXXX escape";
3354         hexescape:
3355             chr = 0;
3356             outpos = p-PyUnicode_AS_UNICODE(v);
3357             if (s+digits>end) {
3358                 endinpos = size;
3359                 if (unicode_decode_call_errorhandler(
3360                         errors, &errorHandler,
3361                         "unicodeescape", "end of string in escape sequence",
3362                         &starts, &end, &startinpos, &endinpos, &exc, &s,
3363                         &v, &outpos, &p))
3364                     goto onError;
3365                 goto nextByte;
3366             }
3367             for (i = 0; i < digits; ++i) {
3368                 c = (unsigned char) s[i];
3369                 if (!ISXDIGIT(c)) {
3370                     endinpos = (s+i+1)-starts;
3371                     if (unicode_decode_call_errorhandler(
3372                             errors, &errorHandler,
3373                             "unicodeescape", message,
3374                             &starts, &end, &startinpos, &endinpos, &exc, &s,
3375                             &v, &outpos, &p))
3376                         goto onError;
3377                     goto nextByte;
3378                 }
3379                 chr = (chr<<4) & ~0xF;
3380                 if (c >= '0' && c <= '9')
3381                     chr += c - '0';
3382                 else if (c >= 'a' && c <= 'f')
3383                     chr += 10 + c - 'a';
3384                 else
3385                     chr += 10 + c - 'A';
3386             }
3387             s += i;
3388             if (chr == 0xffffffff && PyErr_Occurred())
3389                 /* _decoding_error will have already written into the
3390                    target buffer. */
3391                 break;
3392         store:
3393             /* when we get here, chr is a 32-bit unicode character */
3394             if (chr <= 0xffff)
3395                 /* UCS-2 character */
3396                 *p++ = (Py_UNICODE) chr;
3397             else if (chr <= 0x10ffff) {
3398                 /* UCS-4 character. Either store directly, or as
3399                    surrogate pair. */
3400 #ifdef Py_UNICODE_WIDE
3401                 *p++ = chr;
3402 #else
3403                 chr -= 0x10000L;
3404                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3405                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3406 #endif
3407             } else {
3408                 endinpos = s-starts;
3409                 outpos = p-PyUnicode_AS_UNICODE(v);
3410                 if (unicode_decode_call_errorhandler(
3411                         errors, &errorHandler,
3412                         "unicodeescape", "illegal Unicode character",
3413                         &starts, &end, &startinpos, &endinpos, &exc, &s,
3414                         &v, &outpos, &p))
3415                     goto onError;
3416             }
3417             break;
3418
3419             /* \N{name} */
3420         case 'N':
3421             message = "malformed \\N character escape";
3422             if (ucnhash_CAPI == NULL) {
3423                 /* load the unicode data module */
3424                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3425                 if (ucnhash_CAPI == NULL)
3426                     goto ucnhashError;
3427             }
3428             if (*s == '{') {
3429                 const char *start = s+1;
3430                 /* look for the closing brace */
3431                 while (*s != '}' && s < end)
3432                     s++;
3433                 if (s > start && s < end && *s == '}') {
3434                     /* found a name.  look it up in the unicode database */
3435                     message = "unknown Unicode character name";
3436                     s++;
3437                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3438                         goto store;
3439                 }
3440             }
3441             endinpos = s-starts;
3442             outpos = p-PyUnicode_AS_UNICODE(v);
3443             if (unicode_decode_call_errorhandler(
3444                     errors, &errorHandler,
3445                     "unicodeescape", message,
3446                     &starts, &end, &startinpos, &endinpos, &exc, &s,
3447                     &v, &outpos, &p))
3448                 goto onError;
3449             break;
3450
3451         default:
3452             if (s > end) {
3453                 message = "\\ at end of string";
3454                 s--;
3455                 endinpos = s-starts;
3456                 outpos = p-PyUnicode_AS_UNICODE(v);
3457                 if (unicode_decode_call_errorhandler(
3458                         errors, &errorHandler,
3459                         "unicodeescape", message,
3460                         &starts, &end, &startinpos, &endinpos, &exc, &s,
3461                         &v, &outpos, &p))
3462                     goto onError;
3463             }
3464             else {
3465                 *p++ = '\\';
3466                 *p++ = (unsigned char)s[-1];
3467             }
3468             break;
3469         }
3470       nextByte:
3471         ;
3472     }
3473     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3474         goto onError;
3475     Py_XDECREF(errorHandler);
3476     Py_XDECREF(exc);
3477     return (PyObject *)v;
3478
3479   ucnhashError:
3480     PyErr_SetString(
3481         PyExc_UnicodeError,
3482         "\\N escapes not supported (can't load unicodedata module)"
3483         );
3484     Py_XDECREF(v);
3485     Py_XDECREF(errorHandler);
3486     Py_XDECREF(exc);
3487     return NULL;
3488
3489   onError:
3490     Py_XDECREF(v);
3491     Py_XDECREF(errorHandler);
3492     Py_XDECREF(exc);
3493     return NULL;
3494 }
3495
3496 /* Return a Unicode-Escape string version of the Unicode object.
3497
3498    If quotes is true, the string is enclosed in u"" or u'' quotes as
3499    appropriate.
3500
3501 */
3502
3503 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3504                                              Py_ssize_t size,
3505                                              Py_UNICODE ch)
3506 {
3507     /* like wcschr, but doesn't stop at NULL characters */
3508
3509     while (size-- > 0) {
3510         if (*s == ch)
3511             return s;
3512         s++;
3513     }
3514
3515     return NULL;
3516 }
3517
3518 static const char *hexdigits = "0123456789abcdef";
3519
     /* Encode `size` Py_UNICODE units starting at `s` as a bytes object in
        the "unicode-escape" style implemented below:
          - a backslash is written as two backslashes;
          - units >= 0x10000 (wide builds) and valid surrogate pairs (narrow
            builds) become '\Uxxxxxxxx';
          - other units >= 256 become '\uxxxx';
          - tab, newline and carriage return become '\t', '\n', '\r';
          - remaining units below ' ' or >= 0x7F become '\xhh';
          - everything else is copied unchanged.
        Returns the new bytes object (empty when size == 0), or NULL when
        the worst-case buffer size would overflow Py_ssize_t or when the
        allocation or the final resize fails. */
3520 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3521                                         Py_ssize_t size)
3522 {
3523     PyObject *repr;
3524     char *p;
3525
3526 #ifdef Py_UNICODE_WIDE
3527     const Py_ssize_t expandsize = 10;
3528 #else
3529     const Py_ssize_t expandsize = 6;
3530 #endif
3531
3532     /* XXX(nnorwitz): rather than over-allocating, it would be
3533        better to choose a different scheme.  Perhaps scan the
3534        first N-chars of the string and allocate based on that size.
3535     */
3536     /* Initial allocation is based on the longest-possible unichr
3537        escape.
3538
3539        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3540        unichr, so in this case it's the longest unichr escape. In
3541        narrow (UTF-16) builds this is five chars per source unichr
3542        since there are two unichrs in the surrogate pair, so in narrow
3543        (UTF-16) builds it's not the longest unichr escape.
3544
3545        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3546        so in the narrow (UTF-16) build case it's the longest unichr
3547        escape.
3548     */
3549
3550     if (size == 0)
3551         return PyBytes_FromStringAndSize(NULL, 0);
3552
3553     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3554         return PyErr_NoMemory();
3555
3556     repr = PyBytes_FromStringAndSize(NULL,
3557                                      2
3558                                      + expandsize*size
3559                                      + 1);
3560     if (repr == NULL)
3561         return NULL;
3562
3563     p = PyBytes_AS_STRING(repr);
3564
3565     while (size-- > 0) {
3566         Py_UNICODE ch = *s++;
3567
3568         /* Escape backslashes */
3569         if (ch == '\\') {
3570             *p++ = '\\';
3571             *p++ = (char) ch;
3572             continue;
3573         }
3574
3575 #ifdef Py_UNICODE_WIDE
3576         /* Map 21-bit characters to '\U00xxxxxx' */
3577         else if (ch >= 0x10000) {
3578             *p++ = '\\';
3579             *p++ = 'U';
3580             *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3581             *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3582             *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3583             *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3584             *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3585             *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3586             *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3587             *p++ = hexdigits[ch & 0x0000000F];
3588             continue;
3589         }
3590 #else
3591         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
             /* Look ahead only while input remains (size > 0); otherwise a
                high surrogate in the last position would read one unit
                past the end of the caller's buffer. */
3592         else if (ch >= 0xD800 && ch < 0xDC00 && size > 0) {
3593             Py_UNICODE ch2;
3594             Py_UCS4 ucs;
3595
3596             ch2 = *s++;
3597             size--;
3598             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3599                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3600                 *p++ = '\\';
3601                 *p++ = 'U';
3602                 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3603                 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3604                 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3605                 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3606                 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3607                 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3608                 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3609                 *p++ = hexdigits[ucs & 0x0000000F];
3610                 continue;
3611             }
3612             /* Fall through: un-read ch2; the isolated high surrogate is
                    then escaped by the 16-bit '\uxxxx' branch below */
3613             s--;
3614             size++;
3615         }
3616 #endif
3617
3618         /* Map 16-bit characters to '\uxxxx' */
3619         if (ch >= 256) {
3620             *p++ = '\\';
3621             *p++ = 'u';
3622             *p++ = hexdigits[(ch >> 12) & 0x000F];
3623             *p++ = hexdigits[(ch >> 8) & 0x000F];
3624             *p++ = hexdigits[(ch >> 4) & 0x000F];
3625             *p++ = hexdigits[ch & 0x000F];
3626         }
3627
3628         /* Map special whitespace to '\t', '\n', '\r' */
3629         else if (ch == '\t') {
3630             *p++ = '\\';
3631             *p++ = 't';
3632         }
3633         else if (ch == '\n') {
3634             *p++ = '\\';
3635             *p++ = 'n';
3636         }
3637         else if (ch == '\r') {
3638             *p++ = '\\';
3639             *p++ = 'r';
3640         }
3641
3642         /* Map non-printable US ASCII to '\xhh' */
3643         else if (ch < ' ' || ch >= 0x7F) {
3644             *p++ = '\\';
3645             *p++ = 'x';
3646             *p++ = hexdigits[(ch >> 4) & 0x000F];
3647             *p++ = hexdigits[ch & 0x000F];
3648         }
3649
3650         /* Copy everything else as-is */
3651         else
3652             *p++ = (char) ch;
3653     }
3654
         /* Shrink the over-allocated buffer to the bytes actually written */
3655     assert(p - PyBytes_AS_STRING(repr) > 0);
3656     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3657         return NULL;
3658     return repr;
3659 }
3660
3661 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3662 {
3663     /* Front end for PyUnicode_EncodeUnicodeEscape(): a non-unicode
3664        argument is rejected with PyErr_BadArgument() and NULL. */
3665     if (PyUnicode_Check(unicode))
3666         return PyUnicode_EncodeUnicodeEscape(
3667             PyUnicode_AS_UNICODE(unicode),
3668             PyUnicode_GET_SIZE(unicode));
3669     PyErr_BadArgument();
3670     return NULL;
3671 }
3672
3673 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3674
     /* Decode `size` bytes at `s` from the "raw-unicode-escape" format.
        A byte other than a backslash becomes the code unit with the same
        ordinal.  A run of backslashes is copied through unchanged, except
        that an odd-length run followed by 'u' or 'U' starts an escape of
        4 (u) or 8 (U) hex digits whose value replaces the run's last
        backslash.  Escapes that are cut short or exceed 0x10ffff are
        reported through the `errors` handler.
        Returns a new unicode object, or NULL on failure. */
3675 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3676                                            Py_ssize_t size,
3677                                            const char *errors)
3678 {
3679     const char *starts = s;
3680     Py_ssize_t startinpos;
3681     Py_ssize_t endinpos;
3682     Py_ssize_t outpos;
3683     PyUnicodeObject *v;
3684     Py_UNICODE *p;
3685     const char *end;
3686     const char *bs;
3687     PyObject *errorHandler = NULL;
3688     PyObject *exc = NULL;
3689
3690     /* Escaped strings will always be longer than the resulting
3691        Unicode string, so we start with size here and then reduce the
3692        length after conversion to the true value. (But decoding error
3693        handler might have to resize the string) */
3694     v = _PyUnicode_New(size);
3695     if (v == NULL)
3696         goto onError;
3697     if (size == 0)
3698         return (PyObject *)v;
3699     p = PyUnicode_AS_UNICODE(v);
3700     end = s + size;
3701     while (s < end) {
3702         unsigned char c;
3703         Py_UCS4 x;
3704         int i;
3705         int count;
3706
3707         /* Non-escape characters are interpreted as Unicode ordinals */
3708         if (*s != '\\') {
3709             *p++ = (unsigned char)*s++;
3710             continue;
3711         }
3712         startinpos = s-starts;
3713
3714         /* \u-escapes are only interpreted iff the number of leading
3715            backslashes is odd */
3716         bs = s;
3717         for (;s < end;) {
3718             if (*s != '\\')
3719                 break;
3720             *p++ = (unsigned char)*s++;
3721         }
3722         if (((s - bs) & 1) == 0 ||
3723             s >= end ||
3724             (*s != 'u' && *s != 'U')) {
3725             continue;
3726         }
             /* Drop the last copied backslash: the escape value replaces it */
3727         p--;
3728         count = *s=='u' ? 4 : 8;
3729         s++;
3730
3731         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3732         outpos = p-PyUnicode_AS_UNICODE(v);
3733         for (x = 0, i = 0; i < count; ++i, ++s) {
                 /* Never dereference s at or beyond end: running out of
                    input inside an escape is reported as truncation. */
3734             c = (s < end) ? (unsigned char)*s : '\0';
3735             if (s >= end || !ISXDIGIT(c)) {
3736                 endinpos = s-starts;
3737                 if (unicode_decode_call_errorhandler(
3738                         errors, &errorHandler,
3739                         "rawunicodeescape", "truncated \\uXXXX",
3740                         &starts, &end, &startinpos, &endinpos, &exc, &s,
3741                         &v, &outpos, &p))
3742                     goto onError;
3743                 goto nextByte;
3744             }
3745             x = (x<<4) & ~0xF;
3746             if (c >= '0' && c <= '9')
3747                 x += c - '0';
3748             else if (c >= 'a' && c <= 'f')
3749                 x += 10 + c - 'a';
3750             else
3751                 x += 10 + c - 'A';
3752         }
3753         if (x <= 0xffff)
3754             /* UCS-2 character */
3755             *p++ = (Py_UNICODE) x;
3756         else if (x <= 0x10ffff) {
3757             /* UCS-4 character. Either store directly, or as
3758                surrogate pair. */
3759 #ifdef Py_UNICODE_WIDE
3760             *p++ = (Py_UNICODE) x;
3761 #else
3762             x -= 0x10000L;
3763             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3764             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3765 #endif
3766         } else {
3767             endinpos = s-starts;
3768             outpos = p-PyUnicode_AS_UNICODE(v);
3769             if (unicode_decode_call_errorhandler(
3770                     errors, &errorHandler,
3771                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3772                     &starts, &end, &startinpos, &endinpos, &exc, &s,
3773                     &v, &outpos, &p))
3774                 goto onError;
3775         }
3776       nextByte:
3777         ;
3778     }
         /* Trim the result to the number of units actually produced */
3779     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3780         goto onError;
3781     Py_XDECREF(errorHandler);
3782     Py_XDECREF(exc);
3783     return (PyObject *)v;
3784
3785   onError:
3786     Py_XDECREF(v);
3787     Py_XDECREF(errorHandler);
3788     Py_XDECREF(exc);
3789     return NULL;
3790 }
3791
     /* Encode `size` Py_UNICODE units at `s` in "raw-unicode-escape" form:
        units >= 0x10000 (wide builds) and valid surrogate pairs (narrow
        builds) become '\Uxxxxxxxx', other units >= 256 become '\uxxxx',
        and every unit below 256 is copied as a single byte.
        Returns a new bytes object (empty when size == 0), or NULL when the
        worst-case size would overflow or an allocation/resize fails. */
3792 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3793                                            Py_ssize_t size)
3794 {
3795     PyObject *repr;
3796     char *p;
3797     char *q;
3798
3799 #ifdef Py_UNICODE_WIDE
3800     const Py_ssize_t expandsize = 10;
3801 #else
3802     const Py_ssize_t expandsize = 6;
3803 #endif
3804
3805     if (size > PY_SSIZE_T_MAX / expandsize)
3806         return PyErr_NoMemory();
3807
3808     repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3809     if (repr == NULL)
3810         return NULL;
3811     if (size == 0)
3812         return repr;
3813
3814     p = q = PyBytes_AS_STRING(repr);
3815     while (size-- > 0) {
3816         Py_UNICODE ch = *s++;
3817 #ifdef Py_UNICODE_WIDE
3818         /* Map 32-bit characters to '\Uxxxxxxxx' */
3819         if (ch >= 0x10000) {
3820             *p++ = '\\';
3821             *p++ = 'U';
3822             *p++ = hexdigits[(ch >> 28) & 0xf];
3823             *p++ = hexdigits[(ch >> 24) & 0xf];
3824             *p++ = hexdigits[(ch >> 20) & 0xf];
3825             *p++ = hexdigits[(ch >> 16) & 0xf];
3826             *p++ = hexdigits[(ch >> 12) & 0xf];
3827             *p++ = hexdigits[(ch >> 8) & 0xf];
3828             *p++ = hexdigits[(ch >> 4) & 0xf];
3829             *p++ = hexdigits[ch & 15];
3830         }
3831         else
3832 #else
3833             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
                 /* Look ahead only while input remains (size > 0); otherwise
                    a high surrogate in the last position would read one
                    unit past the end of the caller's buffer. */
3834             if (ch >= 0xD800 && ch < 0xDC00 && size > 0) {
3835                 Py_UNICODE ch2;
3836                 Py_UCS4 ucs;
3837
3838                 ch2 = *s++;
3839                 size--;
3840                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3841                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3842                     *p++ = '\\';
3843                     *p++ = 'U';
3844                     *p++ = hexdigits[(ucs >> 28) & 0xf];
3845                     *p++ = hexdigits[(ucs >> 24) & 0xf];
3846                     *p++ = hexdigits[(ucs >> 20) & 0xf];
3847                     *p++ = hexdigits[(ucs >> 16) & 0xf];
3848                     *p++ = hexdigits[(ucs >> 12) & 0xf];
3849                     *p++ = hexdigits[(ucs >> 8) & 0xf];
3850                     *p++ = hexdigits[(ucs >> 4) & 0xf];
3851                     *p++ = hexdigits[ucs & 0xf];
3852                     continue;
3853                 }
3854                 /* Fall through: un-read ch2; the isolated high surrogate
                        is then escaped by the '\uxxxx' branch below */
3855                 s--;
3856                 size++;
3857             }
3858 #endif
3859         /* Map 16-bit characters to '\uxxxx' */
3860         if (ch >= 256) {
3861             *p++ = '\\';
3862             *p++ = 'u';
3863             *p++ = hexdigits[(ch >> 12) & 0xf];
3864             *p++ = hexdigits[(ch >> 8) & 0xf];
3865             *p++ = hexdigits[(ch >> 4) & 0xf];
3866             *p++ = hexdigits[ch & 15];
3867         }
3868         /* Copy everything else as-is */
3869         else
3870             *p++ = (char) ch;
3871     }
         /* size is reused here as the number of bytes written */
3872     size = p - q;
3873
3874     assert(size > 0);
3875     if (_PyBytes_Resize(&repr, size) < 0)
3876         return NULL;
3877     return repr;
3878 }
3879
3880 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3881 {
3882     /* Front end for PyUnicode_EncodeRawUnicodeEscape(): a non-unicode
3883        argument is rejected with PyErr_BadArgument() and NULL; anything
3884        else is encoded from the object's buffer and size. */
3885     if (PyUnicode_Check(unicode))
3886         return PyUnicode_EncodeRawUnicodeEscape(
3887             PyUnicode_AS_UNICODE(unicode),
3888             PyUnicode_GET_SIZE(unicode));
3889     PyErr_BadArgument();
3890     return NULL;
3891 }
3892
3893 /* --- Unicode Internal Codec ------------------------------------------- */
3894
     /* Decode the "unicode_internal" format: `s` is read as a sequence of
        raw Py_UNICODE units of Py_UNICODE_SIZE bytes each.  A trailing
        partial unit ("truncated input") and, in wide builds, a unit
        outside the range 0..PyUnicode_GetMax() are passed to the `errors`
        handler.  Returns a new unicode object, or NULL on failure. */
3895 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3896                                            Py_ssize_t size,
3897                                            const char *errors)
3898 {
3899     const char *starts = s;
3900     Py_ssize_t startinpos;
3901     Py_ssize_t endinpos;
3902     Py_ssize_t outpos;
3903     PyUnicodeObject *v;
3904     Py_UNICODE *p;
3905     const char *end;
3906     const char *reason;
3907     PyObject *errorHandler = NULL;
3908     PyObject *exc = NULL;
3909
3910 #ifdef Py_UNICODE_WIDE
3911     Py_UNICODE unimax = PyUnicode_GetMax();
3912 #endif
3913
3914     /* XXX overflow detection missing */
         /* Room for one unit per (possibly partial) group of input bytes */
3915     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3916     if (v == NULL)
3917         goto onError;
3918     if (PyUnicode_GetSize((PyObject *)v) == 0)
3919         return (PyObject *)v;
3920     p = PyUnicode_AS_UNICODE(v);
3921     end = s + size;
3922
3923     while (s < end) {
             /* Copy a unit only when a complete one remains; copying first
                would read past `end` on truncated input. */
             if (end-s >= Py_UNICODE_SIZE)
3924             memcpy(p, s, sizeof(Py_UNICODE));
3925         /* We have to sanity check the raw data, otherwise doom looms for
3926            some malformed UCS-4 data. */
             /* The truncation test comes first so that *p is only examined
                after it has been filled in above. */
3927         if (
3928             end-s < Py_UNICODE_SIZE
3929 #ifdef Py_UNICODE_WIDE
3930             || *p > unimax || *p < 0
3931 #endif
3932             )
3933         {
3934             startinpos = s - starts;
3935             if (end-s < Py_UNICODE_SIZE) {
3936                 endinpos = end-starts;
3937                 reason = "truncated input";
3938             }
3939             else {
3940                 endinpos = s - starts + Py_UNICODE_SIZE;
3941                 reason = "illegal code point (> 0x10FFFF)";
3942             }
3943             outpos = p - PyUnicode_AS_UNICODE(v);
3944             if (unicode_decode_call_errorhandler(
3945                     errors, &errorHandler,
3946                     "unicode_internal", reason,
3947                     &starts, &end, &startinpos, &endinpos, &exc, &s,
3948                     &v, &outpos, &p)) {
3949                 goto onError;
3950             }
3951         }
3952         else {
3953             p++;
3954             s += Py_UNICODE_SIZE;
3955         }
3956     }
3957
3958     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3959         goto onError;
3960     Py_XDECREF(errorHandler);
3961     Py_XDECREF(exc);
3962     return (PyObject *)v;
3963
3964   onError:
3965     Py_XDECREF(v);
3966     Py_XDECREF(errorHandler);
3967     Py_XDECREF(exc);
3968     return NULL;
3969 }
3970
3971 /* --- Latin-1 Codec ------------------------------------------------------ */
3972
     /* Decode `size` Latin-1 bytes at `s`: every byte becomes the code unit
        with the same ordinal, so the `errors` argument is never consulted.
        Returns a new unicode object, or NULL on failure. */
3973 PyObject *PyUnicode_DecodeLatin1(const char *s,
3974                                  Py_ssize_t size,
3975                                  const char *errors)
3976 {
3977     PyUnicodeObject *v;
3978     Py_UNICODE *p;
3979     const char *e;
3980
3981     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3982     if (size == 1) {
3983         Py_UNICODE r = *(unsigned char*)s;
3984         return PyUnicode_FromUnicode(&r, 1);
3985     }
3986
3987     v = _PyUnicode_New(size);
3988     if (v == NULL)
3989         goto onError;
3990     if (size == 0)
3991         return (PyObject *)v;
3992     p = PyUnicode_AS_UNICODE(v);
3993     e = s + size;
3994     /* Unrolling the copy makes it much faster by reducing the looping
3995        overhead. This is similar to what many memcpy() implementations do.
3996        The bound is tested as a remaining byte count (e - s) instead of
            against a precomputed "e - 4" pointer, which would point before
            the start of the buffer for inputs shorter than 4 bytes. */
3997     while (e - s > 4) {
3998         p[0] = (unsigned char) s[0];
3999         p[1] = (unsigned char) s[1];
4000         p[2] = (unsigned char) s[2];
4001         p[3] = (unsigned char) s[3];
4002         s += 4;
4003         p += 4;
4004     }
         /* Copy the remaining tail (at most 4 bytes) one at a time */
4005     while (s < e)
4006         *p++ = (unsigned char) *s++;
4007     return (PyObject *)v;
4008
4009   onError:
4010     Py_XDECREF(v);
4011     return NULL;
4012 }
4013
4014 /* Make *exceptionObject describe the given encoding failure: build a
4015    new UnicodeEncodeError when none exists yet, otherwise update the
4016    start, end and reason of the existing one.  If any of the updates
4017    fails, the existing exception is released and *exceptionObject is
4018    reset to NULL. */
4019 static void make_encode_exception(PyObject **exceptionObject,
4020                                   const char *encoding,
4021                                   const Py_UNICODE *unicode, Py_ssize_t size,
4022                                   Py_ssize_t startpos, Py_ssize_t endpos,
4023                                   const char *reason)
4024 {
4025     PyObject *current = *exceptionObject;
4026     if (current == NULL) {
4027         *exceptionObject = PyUnicodeEncodeError_Create(
4028             encoding, unicode, size, startpos, endpos, reason);
4029         return;
4030     }
4031     if (PyUnicodeEncodeError_SetStart(current, startpos) ||
4032         PyUnicodeEncodeError_SetEnd(current, endpos) ||
4033         PyUnicodeEncodeError_SetReason(current, reason)) {
4034         Py_DECREF(current);
4035         *exceptionObject = NULL;
4036     }
4037 }
4038
4039 /* Prepare the UnicodeEncodeError and raise it via PyCodec_StrictErrors() */
4040 static void raise_encode_exception(PyObject **exceptionObject,
4041                                    const char *encoding,
4042                                    const Py_UNICODE *unicode, Py_ssize_t size,
4043                                    Py_ssize_t startpos, Py_ssize_t endpos,
4044                                    const char *reason)
4045 {
4046     make_encode_exception(exceptionObject, encoding, unicode, size,
4047                           startpos, endpos, reason);
4048     if (*exceptionObject)
4049         PyCodec_StrictErrors(*exceptionObject);
4050 }
4051
4052 /* error handling callback helper:
4053    build arguments, call the callback and check the arguments,
4054    put the result into newpos and return the replacement string, which
4055    has to be freed by the caller */
4056 static PyObject *unicode_encode_call_errorhandler(const char *errors,
4057                                                   PyObject **errorHandler,
4058                                                   const char *encoding, const char *reason,
4059                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4060                                                   Py_ssize_t startpos, Py_ssize_t endpos,
4061                                                   Py_ssize_t *newpos)
4062 {
4063     static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4064
4065     PyObject *restuple;
4066     PyObject *resunicode;
4067
4068     if (*errorHandler == NULL) {
4069         *errorHandler = PyCodec_LookupError(errors);
4070         if (*errorHandler == NULL)
4071             return NULL;
4072     }
4073
4074     make_encode_exception(exceptionObject,
4075                           encoding, unicode, size, startpos, endpos, reason);
4076     if (*exceptionObject == NULL)
4077         return NULL;
4078
4079     restuple = PyObject_CallFunctionObjArgs(
4080         *errorHandler, *exceptionObject, NULL);
4081     if (restuple == NULL)
4082         return NULL;
4083     if (!PyTuple_Check(restuple)) {
4084         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4085         Py_DECREF(restuple);
4086         return NULL;
4087     }
4088     if (!PyArg_ParseTuple(restuple, argparse,
4089                           &resunicode, newpos)) {
4090         Py_DECREF(restuple);
4091         return NULL;
4092     }
4093     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4094         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4095         Py_DECREF(restuple);
4096         return NULL;
4097     }
4098     if (*newpos<0)
4099         *newpos = size+*newpos;
4100     if (*newpos<0 || *newpos>size) {
4101         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4102         Py_DECREF(restuple);
4103         return NULL;
4104     }
4105     Py_INCREF(resunicode);
4106     Py_DECREF(restuple);
4107     return resunicode;
4108 }
4109
     /* Shared encoder behind PyUnicode_EncodeLatin1() (limit == 256) and
        PyUnicode_EncodeASCII() (limit == 128).  Every unit of p[0..size)
        that is below `limit` is written as one byte.  A run of units at or
        above `limit` is handled according to `errors`:
          strict (or NULL)   - raise via raise_encode_exception() and fail
          replace            - one '?' per unit
          ignore             - the run is skipped
          xmlcharrefreplace  - "&#<decimal>;" per unit
          anything else      - the handler obtained through
                               unicode_encode_call_errorhandler()
        Returns a new bytes object, or NULL on error. */
4110 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4111                                      Py_ssize_t size,
4112                                      const char *errors,
4113                                      int limit)
4114 {
4115     /* output object */
4116     PyObject *res;
4117     /* pointers to the beginning and end+1 of input */
4118     const Py_UNICODE *startp = p;
4119     const Py_UNICODE *endp = p + size;
4120     /* pointer to the beginning of the unencodable characters */
4121     /* const Py_UNICODE *badp = NULL; */
4122     /* pointer into the output */
4123     char *str;
4124     /* current output position */
4125     Py_ssize_t ressize;
4126     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4127     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4128     PyObject *errorHandler = NULL;
4129     PyObject *exc = NULL;
4130     /* the following variable is used for caching string comparisons
4131      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132     int known_errorHandler = -1;
4133
4134     /* allocate enough for a simple encoding without
4135        replacements, if we need more, we'll resize */
4136     if (size == 0)
4137         return PyBytes_FromStringAndSize(NULL, 0);
4138     res = PyBytes_FromStringAndSize(NULL, size);
4139     if (res == NULL)
4140         return NULL;
4141     str = PyBytes_AS_STRING(res);
4142     ressize = size;
4143
4144     while (p<endp) {
4145         Py_UNICODE c = *p;
4146
4147         /* can we encode this? */
4148         if (c<limit) {
4149             /* no overflow check, because we know that the space is enough */
4150             *str++ = (char)c;
4151             ++p;
4152         }
4153         else {
4154             Py_ssize_t unicodepos = p-startp;
4155             Py_ssize_t requiredsize;
4156             PyObject *repunicode;
4157             Py_ssize_t repsize;
4158             Py_ssize_t newpos;
4159             Py_ssize_t respos;
4160             Py_UNICODE *uni2;
4161             /* startpos for collecting unencodable chars */
4162             const Py_UNICODE *collstart = p;
4163             const Py_UNICODE *collend = p;
4164             /* find all unecodable characters */
4165             while ((collend < endp) && ((*collend)>=limit))
4166                 ++collend;
4167             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4168             if (known_errorHandler==-1) {
4169                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4170                     known_errorHandler = 1;
4171                 else if (!strcmp(errors, "replace"))
4172                     known_errorHandler = 2;
4173                 else if (!strcmp(errors, "ignore"))
4174                     known_errorHandler = 3;
4175                 else if (!strcmp(errors, "xmlcharrefreplace"))
4176                     known_errorHandler = 4;
4177                 else
4178                     known_errorHandler = 0;
4179             }
4180             switch (known_errorHandler) {
4181             case 1: /* strict */
4182                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4183                 goto onError;
4184             case 2: /* replace */
                     /* one '?' per unencodable unit, so the output never
                        needs more room than the input provided */
4185                 while (collstart++<collend)
4186                     *str++ = '?'; /* fall through */
4187             case 3: /* ignore */
4188                 p = collend;
4189                 break;
4190             case 4: /* xmlcharrefreplace */
4191                 respos = str - PyBytes_AS_STRING(res);
4192                 /* determine replacement size (temporarily (mis)uses p) */
                     /* each term is 2 for "&#", the number of decimal digits,
                        and 1 for ";" */
4193                 for (p = collstart, repsize = 0; p < collend; ++p) {
4194                     if (*p<10)
4195                         repsize += 2+1+1;
4196                     else if (*p<100)
4197                         repsize += 2+2+1;
4198                     else if (*p<1000)
4199                         repsize += 2+3+1;
4200                     else if (*p<10000)
4201                         repsize += 2+4+1;
4202 #ifndef Py_UNICODE_WIDE
4203                     else
4204                         repsize += 2+5+1;
4205 #else
4206                     else if (*p<100000)
4207                         repsize += 2+5+1;
4208                     else if (*p<1000000)
4209                         repsize += 2+6+1;
4210                     else
4211                         repsize += 2+7+1;
4212 #endif
4213                 }
                     /* bytes written so far + replacement + one byte for every
                        unit after the unencodable run */
4214                 requiredsize = respos+repsize+(endp-collend);
4215                 if (requiredsize > ressize) {
                         /* grow to at least double the current size */
4216                     if (requiredsize<2*ressize)
4217                         requiredsize = 2*ressize;
4218                     if (_PyBytes_Resize(&res, requiredsize))
4219                         goto onError;
                         /* the resize may have moved the buffer */
4220                     str = PyBytes_AS_STRING(res) + respos;
4221                     ressize = requiredsize;
4222                 }
4223                 /* generate replacement (temporarily (mis)uses p) */
4224                 for (p = collstart; p < collend; ++p) {
4225                     str += sprintf(str, "&#%d;", (int)*p);
4226                 }
4227                 p = collend;
4228                 break;
4229             default:
4230                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4231                                                               encoding, reason, startp, size, &exc,
4232                                                               collstart-startp, collend-startp, &newpos);
4233                 if (repunicode == NULL)
4234                     goto onError;
4235                 if (PyBytes_Check(repunicode)) {
4236                     /* Directly copy bytes result to output. */
4237                     repsize = PyBytes_Size(repunicode);
4238                     if (repsize > 1) {
4239                         /* Make room for all additional bytes. */
                             /* NOTE(review): grows the buffer by repsize-1,
                                i.e. as if exactly one input unit were
                                replaced; the resume position newpos is not
                                taken into account here -- confirm. */
4240                         respos = str - PyBytes_AS_STRING(res);
4241                         if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4242                             Py_DECREF(repunicode);
4243                             goto onError;
4244                         }
4245                         str = PyBytes_AS_STRING(res) + respos;
4246                         ressize += repsize-1;
4247                     }
4248                     memcpy(str, PyBytes_AsString(repunicode), repsize);
4249                     str += repsize;
                         /* continue at the position chosen by the handler */
4250                     p = startp + newpos;
4251                     Py_DECREF(repunicode);
4252                     break;
4253                 }
4254                 /* need more space? (at least enough for what we
4255                    have+the replacement+the rest of the string, so
4256                    we won't have to check space for encodable characters) */
                     /* NOTE(review): this budget assumes encoding resumes at
                        collend, but p is set to startp + newpos further down
                        and the helper only guarantees 0 <= newpos <= size --
                        confirm a handler that resumes earlier cannot overrun
                        res. */
4257                 respos = str - PyBytes_AS_STRING(res);
4258                 repsize = PyUnicode_GET_SIZE(repunicode);
4259                 requiredsize = respos+repsize+(endp-collend);
4260                 if (requiredsize > ressize) {
4261                     if (requiredsize<2*ressize)
4262                         requiredsize = 2*ressize;
4263                     if (_PyBytes_Resize(&res, requiredsize)) {
4264                         Py_DECREF(repunicode);
4265                         goto onError;
4266                     }
4267                     str = PyBytes_AS_STRING(res) + respos;
4268                     ressize = requiredsize;
4269                 }
4270                 /* check if there is anything unencodable in the replacement
4271                    and copy it to the output */
4272                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4273                     c = *uni2;
4274                     if (c >= limit) {
                             /* the error is reported for the first unit of
                                the original run (unicodepos), not for a
                                position inside the replacement */
4275                         raise_encode_exception(&exc, encoding, startp, size,
4276                                                unicodepos, unicodepos+1, reason);
4277                         Py_DECREF(repunicode);
4278                         goto onError;
4279                     }
4280                     *str = (char)c;
4281                 }
4282                 p = startp + newpos;
4283                 Py_DECREF(repunicode);
4284             }
4285         }
4286     }
4287     /* Resize if we allocated to much */
         /* size is reused here as the number of bytes actually written */
4288     size = str - PyBytes_AS_STRING(res);
4289     if (size < ressize) { /* If this falls res will be NULL */
4290         assert(size >= 0);
4291         if (_PyBytes_Resize(&res, size) < 0)
4292             goto onError;
4293     }
4294
4295     Py_XDECREF(errorHandler);
4296     Py_XDECREF(exc);
4297     return res;
4298
4299   onError:
4300     Py_XDECREF(res);
4301     Py_XDECREF(errorHandler);
4302     Py_XDECREF(exc);
4303     return NULL;
4304 }
4305
     /* Latin-1 encoder: delegates to unicode_encode_ucs1() with a limit of
        256 and returns its result unchanged. */
4306 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4307                                  Py_ssize_t size,
4308                                  const char *errors)
4309 {
4310     return unicode_encode_ucs1(p, size, errors, 256);
4311 }
4312
4313 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4314 {
4315     /* Latin-1 encode with errors=NULL; reject non-unicode arguments. */
4316     if (PyUnicode_Check(unicode))
4317         return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4318                                       PyUnicode_GET_SIZE(unicode),
4319                                       NULL);
4320     PyErr_BadArgument();
4321     return NULL;
4322 }
4323
4324 /* --- 7-bit ASCII Codec -------------------------------------------------- */
4325
     /* Decode `size` bytes at `s` as 7-bit ASCII.  Bytes below 128 become
        the code unit with the same ordinal; every other byte is reported,
        one byte at a time, to the `errors` handler with the reason
        "ordinal not in range(128)".  Returns a new unicode object, or NULL
        on failure. */
4326 PyObject *PyUnicode_DecodeASCII(const char *s,
4327                                 Py_ssize_t size,
4328                                 const char *errors)
4329 {
4330     const char *starts = s;
4331     PyUnicodeObject *v;
4332     Py_UNICODE *p;
4333     Py_ssize_t startinpos;
4334     Py_ssize_t endinpos;
4335     Py_ssize_t outpos;
4336     const char *e;
4337     PyObject *errorHandler = NULL;
4338     PyObject *exc = NULL;
4339
4340     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
         /* Fast path for a single valid byte: goes through
            PyUnicode_FromUnicode() instead of the general loop */
4341     if (size == 1 && *(unsigned char*)s < 128) {
4342         Py_UNICODE r = *(unsigned char*)s;
4343         return PyUnicode_FromUnicode(&r, 1);
4344     }
4345
4346     v = _PyUnicode_New(size);
4347     if (v == NULL)
4348         goto onError;
4349     if (size == 0)
4350         return (PyObject *)v;
4351     p = PyUnicode_AS_UNICODE(v);
4352     e = s + size;
4353     while (s < e) {
4354         register unsigned char c = (unsigned char)*s;
4355         if (c < 128) {
4356             *p++ = c;
4357             ++s;
4358         }
4359         else {
                 /* The handler receives the addresses of starts, e, s, v and
                    p, so any of them may have changed when it returns */
4360             startinpos = s-starts;
4361             endinpos = startinpos + 1;
4362             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4363             if (unicode_decode_call_errorhandler(
4364                     errors, &errorHandler,
4365                     "ascii", "ordinal not in range(128)",
4366                     &starts, &e, &startinpos, &endinpos, &exc, &s,
4367                     &v, &outpos, &p))
4368                 goto onError;
4369         }
4370     }
         /* Shrink only if fewer units were written than were allocated */
4371     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4372         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4373             goto onError;
4374     Py_XDECREF(errorHandler);
4375     Py_XDECREF(exc);
4376     return (PyObject *)v;
4377
4378   onError:
4379     Py_XDECREF(v);
4380     Py_XDECREF(errorHandler);
4381     Py_XDECREF(exc);
4382     return NULL;
4383 }
4384
4385 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4386                                 Py_ssize_t size,
4387                                 const char *errors)
4388 {
4389     return unicode_encode_ucs1(p, size, errors, 128);
4390 }
4391
4392 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4393 {
4394     if (!PyUnicode_Check(unicode)) {
4395         PyErr_BadArgument();
4396         return NULL;
4397     }
4398     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4399                                  PyUnicode_GET_SIZE(unicode),
4400                                  NULL);
4401 }
4402
4403 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4404
4405 /* --- MBCS codecs for Windows -------------------------------------------- */
4406
4407 #if SIZEOF_INT < SIZEOF_SIZE_T
4408 #define NEED_RETRY
4409 #endif
4410
4411 /* XXX This code is limited to "true" double-byte encodings, as
4412    a) it assumes an incomplete character consists of a single byte, and
4413    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4414    encodings, see IsDBCSLeadByteEx documentation. */
4415
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(); it is then accepted
   when CharPrev() cannot step back (it returns the same position), when the
   previous character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
4426
4427 /*
4428  * Decode MBCS string into unicode object. If 'final' is set, converts
4429  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4430  */
4431 static int decode_mbcs(PyUnicodeObject **v,
4432                        const char *s, /* MBCS string */
4433                        int size, /* sizeof MBCS string */
4434                        int final)
4435 {
4436     Py_UNICODE *p;
4437     Py_ssize_t n = 0;
4438     int usize = 0;
4439
4440     assert(size >= 0);
4441
4442     /* Skip trailing lead-byte unless 'final' is set */
4443     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4444         --size;
4445
4446     /* First get the size of the result */
4447     if (size > 0) {
4448         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4449         if (usize == 0) {
4450             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4451             return -1;
4452         }
4453     }
4454
4455     if (*v == NULL) {
4456         /* Create unicode object */
4457         *v = _PyUnicode_New(usize);
4458         if (*v == NULL)
4459             return -1;
4460     }
4461     else {
4462         /* Extend unicode object */
4463         n = PyUnicode_GET_SIZE(*v);
4464         if (_PyUnicode_Resize(v, n + usize) < 0)
4465             return -1;
4466     }
4467
4468     /* Do the conversion */
4469     if (size > 0) {
4470         p = PyUnicode_AS_UNICODE(*v) + n;
4471         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4472             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4473             return -1;
4474         }
4475     }
4476
4477     return size;
4478 }
4479
4480 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4481                                        Py_ssize_t size,
4482                                        const char *errors,
4483                                        Py_ssize_t *consumed)
4484 {
4485     PyUnicodeObject *v = NULL;
4486     int done;
4487
4488     if (consumed)
4489         *consumed = 0;
4490
4491 #ifdef NEED_RETRY
4492   retry:
4493     if (size > INT_MAX)
4494         done = decode_mbcs(&v, s, INT_MAX, 0);
4495     else
4496 #endif
4497         done = decode_mbcs(&v, s, (int)size, !consumed);
4498
4499     if (done < 0) {
4500         Py_XDECREF(v);
4501         return NULL;
4502     }
4503
4504     if (consumed)
4505         *consumed += done;
4506
4507 #ifdef NEED_RETRY
4508     if (size > INT_MAX) {
4509         s += done;
4510         size -= done;
4511         goto retry;
4512     }
4513 #endif
4514
4515     return (PyObject *)v;
4516 }
4517
4518 PyObject *PyUnicode_DecodeMBCS(const char *s,
4519                                Py_ssize_t size,
4520                                const char *errors)
4521 {
4522     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4523 }
4524
4525 /*
4526  * Convert unicode into string object (MBCS).
4527  * Returns 0 if succeed, -1 otherwise.
4528  */
4529 static int encode_mbcs(PyObject **repr,
4530                        const Py_UNICODE *p, /* unicode */
4531                        int size) /* size of unicode */
4532 {
4533     int mbcssize = 0;
4534     Py_ssize_t n = 0;
4535
4536     assert(size >= 0);
4537
4538     /* First get the size of the result */
4539     if (size > 0) {
4540         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4541         if (mbcssize == 0) {
4542             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4543             return -1;
4544         }
4545     }
4546
4547     if (*repr == NULL) {
4548         /* Create string object */
4549         *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4550         if (*repr == NULL)
4551             return -1;
4552     }
4553     else {
4554         /* Extend string object */
4555         n = PyBytes_Size(*repr);
4556         if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4557             return -1;
4558     }
4559
4560     /* Do the conversion */
4561     if (size > 0) {
4562         char *s = PyBytes_AS_STRING(*repr) + n;
4563         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4564             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4565             return -1;
4566         }
4567     }
4568
4569     return 0;
4570 }
4571
4572 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4573                                Py_ssize_t size,
4574                                const char *errors)
4575 {
4576     PyObject *repr = NULL;
4577     int ret;
4578
4579 #ifdef NEED_RETRY
4580   retry:
4581     if (size > INT_MAX)
4582         ret = encode_mbcs(&repr, p, INT_MAX);
4583     else
4584 #endif
4585         ret = encode_mbcs(&repr, p, (int)size);
4586
4587     if (ret < 0) {
4588         Py_XDECREF(repr);
4589         return NULL;
4590     }
4591
4592 #ifdef NEED_RETRY
4593     if (size > INT_MAX) {
4594         p += INT_MAX;
4595         size -= INT_MAX;
4596         goto retry;
4597     }
4598 #endif
4599
4600     return repr;
4601 }
4602
4603 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4604 {
4605     if (!PyUnicode_Check(unicode)) {
4606         PyErr_BadArgument();
4607         return NULL;
4608     }
4609     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4610                                 PyUnicode_GET_SIZE(unicode),
4611                                 NULL);
4612 }
4613
4614 #undef NEED_RETRY
4615
4616 #endif /* MS_WINDOWS */
4617
4618 /* --- Character Mapping Codec -------------------------------------------- */
4619
/* Decode size bytes at s through a character mapping and return a new
   unicode object, or NULL on error.

   mapping may be:
     - NULL: the call is forwarded to PyUnicode_DecodeLatin1();
     - an exact unicode string: byte b maps to mapping[b]; a byte beyond
       the string's length, or an entry equal to 0xfffe, is undefined;
     - any other object: it is indexed with the byte value as an int via
       PyObject_GetItem().  The item may be an int in range(65536), a str
       of any length (1-0, 1-1 or 1-n mapping) or None (undefined).  A
       LookupError from the lookup is treated like None.

   Undefined mappings are passed to the decode error handler selected by
   errors. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* Spare output room obtained by earlier 1-n resizes and not yet used */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output unit per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a decoding table indexed by byte value */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            /* Bytes beyond the end of the table keep the illegal value */
            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* The handler is given the addresses of s, e, v and p;
                   s is not advanced here, so continuing relies on the
                   handler having updated it (NOTE(review): confirm
                   against unicode_decode_call_errorhandler). */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* Generic path: look every byte up in an arbitrary mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyLong_Check(x)) {
                /* Integer result: must fit in 16 bits */
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is not advanced here; see the note in the table path */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* p points into v's buffer, so remember its offset
                           and recompute it after the resize */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* Grow by the shortfall plus four times the
                           replacement length as headroom */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Shrink the result if fewer units were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4780
4781 /* Charmap encoding: the lookup table */
4782
4783 struct encoding_map{
4784     PyObject_HEAD
4785     unsigned char level1[32];
4786     int count2, count3;
4787     unsigned char level23[1];
4788 };
4789
4790 static PyObject*
4791 encoding_map_size(PyObject *obj, PyObject* args)
4792 {
4793     struct encoding_map *map = (struct encoding_map*)obj;
4794     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4795                            128*map->count3);
4796 }
4797
4798 static PyMethodDef encoding_map_methods[] = {
4799     {"size", encoding_map_size, METH_NOARGS,
4800      PyDoc_STR("Return the size (in bytes) of this object") },
4801     { 0 }
4802 };
4803
4804 static void
4805 encoding_map_dealloc(PyObject* o)
4806 {
4807     PyObject_FREE(o);
4808 }
4809
/* Type object for the encoding maps created by
   PyUnicode_BuildEncodingMap().  Only deallocation and the method table
   (the "size" method) are filled in; every other slot is left at 0.
   tp_itemsize is 0, although instances are allocated with extra room for
   their trailing tables behind the basic struct. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4853
4854 PyObject*
4855 PyUnicode_BuildEncodingMap(PyObject* string)
4856 {
4857     Py_UNICODE *decode;
4858     PyObject *result;
4859     struct encoding_map *mresult;
4860     int i;
4861     int need_dict = 0;
4862     unsigned char level1[32];
4863     unsigned char level2[512];
4864     unsigned char *mlevel1, *mlevel2, *mlevel3;
4865     int count2 = 0, count3 = 0;
4866
4867     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4868         PyErr_BadArgument();
4869         return NULL;
4870     }
4871     decode = PyUnicode_AS_UNICODE(string);
4872     memset(level1, 0xFF, sizeof level1);
4873     memset(level2, 0xFF, sizeof level2);
4874
4875     /* If there isn't a one-to-one mapping of NULL to \0,
4876        or if there are non-BMP characters, we need to use
4877        a mapping dictionary. */
4878     if (decode[0] != 0)
4879         need_dict = 1;
4880     for (i = 1; i < 256; i++) {
4881         int l1, l2;
4882         if (decode[i] == 0
4883 #ifdef Py_UNICODE_WIDE
4884             || decode[i] > 0xFFFF
4885 #endif
4886             ) {
4887             need_dict = 1;
4888             break;
4889         }
4890         if (decode[i] == 0xFFFE)
4891             /* unmapped character */
4892             continue;
4893         l1 = decode[i] >> 11;
4894         l2 = decode[i] >> 7;
4895         if (level1[l1] == 0xFF)
4896             level1[l1] = count2++;
4897         if (level2[l2] == 0xFF)
4898             level2[l2] = count3++;
4899     }
4900
4901     if (count2 >= 0xFF || count3 >= 0xFF)
4902         need_dict = 1;
4903
4904     if (need_dict) {
4905         PyObject *result = PyDict_New();
4906         PyObject *key, *value;
4907         if (!result)
4908             return NULL;
4909         for (i = 0; i < 256; i++) {
4910             key = value = NULL;
4911             key = PyLong_FromLong(decode[i]);
4912             value = PyLong_FromLong(i);
4913             if (!key || !value)
4914                 goto failed1;
4915             if (PyDict_SetItem(result, key, value) == -1)
4916                 goto failed1;
4917             Py_DECREF(key);
4918             Py_DECREF(value);
4919         }
4920         return result;
4921       failed1:
4922         Py_XDECREF(key);
4923         Py_XDECREF(value);
4924         Py_DECREF(result);
4925         return NULL;
4926     }
4927
4928     /* Create a three-level trie */
4929     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4930                              16*count2 + 128*count3 - 1);
4931     if (!result)
4932         return PyErr_NoMemory();
4933     PyObject_Init(result, &EncodingMapType);
4934     mresult = (struct encoding_map*)result;
4935     mresult->count2 = count2;
4936     mresult->count3 = count3;
4937     mlevel1 = mresult->level1;
4938     mlevel2 = mresult->level23;
4939     mlevel3 = mresult->level23 + 16*count2;
4940     memcpy(mlevel1, level1, 32);
4941     memset(mlevel2, 0xFF, 16*count2);
4942     memset(mlevel3, 0, 128*count3);
4943     count3 = 0;
4944     for (i = 1; i < 256; i++) {
4945         int o1, o2, o3, i2, i3;
4946         if (decode[i] == 0xFFFE)
4947             /* unmapped character */
4948             continue;
4949         o1 = decode[i]>>11;
4950         o2 = (decode[i]>>7) & 0xF;
4951         i2 = 16*mlevel1[o1] + o2;
4952         if (mlevel2[i2] == 0xFF)
4953             mlevel2[i2] = count3++;
4954         o3 = decode[i] & 0x7F;
4955         i3 = 128*mlevel2[i2] + o3;
4956         mlevel3[i3] = i;
4957     }
4958     return result;
4959 }
4960
4961 static int
4962 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4963 {
4964     struct encoding_map *map = (struct encoding_map*)mapping;
4965     int l1 = c>>11;
4966     int l2 = (c>>7) & 0xF;
4967     int l3 = c & 0x7F;
4968     int i;
4969
4970 #ifdef Py_UNICODE_WIDE
4971     if (c > 0xFFFF) {
4972         return -1;
4973     }
4974 #endif
4975     if (c == 0)
4976         return 0;
4977     /* level 1*/
4978     i = map->level1[l1];
4979     if (i == 0xFF) {
4980         return -1;
4981     }
4982     /* level 2*/
4983     i = map->level23[16*i+l2];
4984     if (i == 0xFF) {
4985         return -1;
4986     }
4987     /* level 3 */
4988     i = map->level23[16*map->count2 + 128*i + l3];
4989     if (i == 0) {
4990         return -1;
4991     }
4992     return i;
4993 }
4994
4995 /* Lookup the character ch in the mapping. If the character
4996    can't be found, Py_None is returned (or NULL, if another
4997    error occurred). */
4998 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4999 {
5000     PyObject *w = PyLong_FromLong((long)c);
5001     PyObject *x;
5002
5003     if (w == NULL)
5004         return NULL;
5005     x = PyObject_GetItem(mapping, w);
5006     Py_DECREF(w);
5007     if (x == NULL) {
5008         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5009             /* No mapping found means: mapping is undefined. */
5010             PyErr_Clear();
5011             x = Py_None;
5012             Py_INCREF(x);
5013             return x;
5014         } else
5015             return NULL;
5016     }
5017     else if (x == Py_None)
5018         return x;
5019     else if (PyLong_Check(x)) {
5020         long value = PyLong_AS_LONG(x);
5021         if (value < 0 || value > 255) {
5022             PyErr_SetString(PyExc_TypeError,
5023                             "character mapping must be in range(256)");
5024             Py_DECREF(x);
5025             return NULL;
5026         }
5027         return x;
5028     }
5029     else if (PyBytes_Check(x))
5030         return x;
5031     else {
5032         /* wrong return value */
5033         PyErr_Format(PyExc_TypeError,
5034                      "character mapping must return integer, bytes or None, not %.400s",
5035                      x->ob_type->tp_name);
5036         Py_DECREF(x);
5037         return NULL;
5038     }
5039 }
5040
5041 static int
5042 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5043 {
5044     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5045     /* exponentially overallocate to minimize reallocations */
5046     if (requiredsize < 2*outsize)
5047         requiredsize = 2*outsize;
5048     if (_PyBytes_Resize(outobj, requiredsize))
5049         return -1;
5050     return 0;
5051 }
5052
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
5056 /* lookup the character, put the result in the output string and adjust
5057    various state variables. Resize the output bytes object if not enough
5058    space is available. Return a new reference to the object that
5059    was put in the output buffer, or Py_None, if the mapping was undefined
5060    (in which case no character was written) or NULL, if a
5061    reallocation error occurred. The caller must decref the result */
5062 static
5063 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5064                                           PyObject **outobj, Py_ssize_t *outpos)
5065 {
5066     PyObject *rep;
5067     char *outstart;
5068     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5069
5070     if (Py_TYPE(mapping) == &EncodingMapType) {
5071         int res = encoding_map_lookup(c, mapping);
5072         Py_ssize_t requiredsize = *outpos+1;
5073         if (res == -1)
5074             return enc_FAILED;
5075         if (outsize<requiredsize)
5076             if (charmapencode_resize(outobj, outpos, requiredsize))
5077                 return enc_EXCEPTION;
5078         outstart = PyBytes_AS_STRING(*outobj);
5079         outstart[(*outpos)++] = (char)res;
5080         return enc_SUCCESS;
5081     }
5082
5083     rep = charmapencode_lookup(c, mapping);
5084     if (rep==NULL)
5085         return enc_EXCEPTION;
5086     else if (rep==Py_None) {
5087         Py_DECREF(rep);
5088         return enc_FAILED;
5089     } else {
5090         if (PyLong_Check(rep)) {
5091             Py_ssize_t requiredsize = *outpos+1;
5092             if (outsize<requiredsize)
5093                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5094                     Py_DECREF(rep);
5095                     return enc_EXCEPTION;
5096                 }
5097             outstart = PyBytes_AS_STRING(*outobj);
5098             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5099         }
5100         else {
5101             const char *repchars = PyBytes_AS_STRING(rep);
5102             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5103             Py_ssize_t requiredsize = *outpos+repsize;
5104             if (outsize<requiredsize)
5105                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5106                     Py_DECREF(rep);
5107                     return enc_EXCEPTION;
5108                 }
5109             outstart = PyBytes_AS_STRING(*outobj);
5110             memcpy(outstart + *outpos, repchars, repsize);
5111             *outpos += repsize;
5112         }
5113     }
5114     Py_DECREF(rep);
5115     return enc_SUCCESS;
5116 }
5117
5118 /* handle an error in PyUnicode_EncodeCharmap
5119    Return 0 on success, -1 on error */
5120 static
5121 int charmap_encoding_error(
5122     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5123     PyObject **exceptionObject,
5124     int *known_errorHandler, PyObject **errorHandler, const char *errors,
5125     PyObject **res, Py_ssize_t *respos)
5126 {
5127     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5128     Py_ssize_t repsize;
5129     Py_ssize_t newpos;
5130     Py_UNICODE *uni2;
5131     /* startpos for collecting unencodable chars */
5132     Py_ssize_t collstartpos = *inpos;
5133     Py_ssize_t collendpos = *inpos+1;
5134     Py_ssize_t collpos;
5135     char *encoding = "charmap";
5136     char *reason = "character maps to <undefined>";
5137     charmapencode_result x;
5138
5139     /* find all unencodable characters */
5140     while (collendpos < size) {
5141         PyObject *rep;
5142         if (Py_TYPE(mapping) == &EncodingMapType) {
5143             int res = encoding_map_lookup(p[collendpos], mapping);
5144             if (res != -1)
5145                 break;
5146             ++collendpos;
5147             continue;
5148         }
5149
5150         rep = charmapencode_lookup(p[collendpos], mapping);
5151         if (rep==NULL)
5152             return -1;
5153         else if (rep!=Py_None) {
5154             Py_DECREF(rep);
5155             break;
5156         }
5157         Py_DECREF(rep);
5158         ++collendpos;
5159     }
5160     /* cache callback name lookup
5161      * (if not done yet, i.e. it's the first error) */
5162     if (*known_errorHandler==-1) {
5163         if ((errors==NULL) || (!strcmp(errors, "strict")))
5164             *known_errorHandler = 1;
5165         else if (!strcmp(errors, "replace"))
5166             *known_errorHandler = 2;
5167         else if (!strcmp(errors, "ignore"))
5168             *known_errorHandler = 3;
5169         else if (!strcmp(errors, "xmlcharrefreplace"))
5170             *known_errorHandler = 4;
5171         else
5172             *known_errorHandler = 0;
5173     }
5174     switch (*known_errorHandler) {
5175     case 1: /* strict */
5176         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5177         return -1;
5178     case 2: /* replace */
5179         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5180             x = charmapencode_output('?', mapping, res, respos);
5181             if (x==enc_EXCEPTION) {
5182                 return -1;
5183             }
5184             else if (x==enc_FAILED) {
5185                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5186                 return -1;
5187             }
5188         }
5189         /* fall through */
5190     case 3: /* ignore */
5191         *inpos = collendpos;
5192         break;
5193     case 4: /* xmlcharrefreplace */
5194         /* generate replacement (temporarily (mis)uses p) */
5195         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5196             char buffer[2+29+1+1];
5197             char *cp;
5198             sprintf(buffer, "&#%d;", (int)p[collpos]);
5199             for (cp = buffer; *cp; ++cp) {
5200                 x = charmapencode_output(*cp, mapping, res, respos);
5201                 if (x==enc_EXCEPTION)
5202                     return -1;
5203                 else if (x==enc_FAILED) {
5204                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5205                     return -1;
5206                 }
5207             }
5208         }
5209         *inpos = collendpos;
5210         break;
5211     default:
5212         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5213                                                       encoding, reason, p, size, exceptionObject,
5214                                                       collstartpos, collendpos, &newpos);
5215         if (repunicode == NULL)
5216             return -1;
5217         if (PyBytes_Check(repunicode)) {
5218             /* Directly copy bytes result to output. */
5219             Py_ssize_t outsize = PyBytes_Size(*res);
5220             Py_ssize_t requiredsize;
5221             repsize = PyBytes_Size(repunicode);
5222             requiredsize = *respos + repsize;
5223             if (requiredsize > outsize)
5224                 /* Make room for all additional bytes. */
5225                 if (charmapencode_resize(res, respos, requiredsize)) {
5226                     Py_DECREF(repunicode);
5227                     return -1;
5228                 }
5229             memcpy(PyBytes_AsString(*res) + *respos,
5230                    PyBytes_AsString(repunicode),  repsize);
5231             *respos += repsize;
5232             *inpos = newpos;
5233             Py_DECREF(repunicode);
5234             break;
5235         }
5236         /* generate replacement  */
5237         repsize = PyUnicode_GET_SIZE(repunicode);
5238         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5239             x = charmapencode_output(*uni2, mapping, res, respos);
5240             if (x==enc_EXCEPTION) {
5241                 return -1;
5242             }
5243             else if (x==enc_FAILED) {
5244                 Py_DECREF(repunicode);
5245                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5246                 return -1;
5247             }
5248         }
5249         *inpos = newpos;
5250         Py_DECREF(repunicode);
5251     }
5252     return 0;
5253 }
5254
5255 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5256                                   Py_ssize_t size,
5257                                   PyObject *mapping,
5258                                   const char *errors)
5259 {
5260     /* output object */
5261     PyObject *res = NULL;
5262     /* current input position */
5263     Py_ssize_t inpos = 0;
5264     /* current output position */
5265     Py_ssize_t respos = 0;
5266     PyObject *errorHandler = NULL;
5267     PyObject *exc = NULL;
5268     /* the following variable is used for caching string comparisons
5269      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5270      * 3=ignore, 4=xmlcharrefreplace */
5271     int known_errorHandler = -1;
5272
5273     /* Default to Latin-1 */
5274     if (mapping == NULL)
5275         return PyUnicode_EncodeLatin1(p, size, errors);
5276
5277     /* allocate enough for a simple encoding without
5278        replacements, if we need more, we'll resize */
5279     res = PyBytes_FromStringAndSize(NULL, size);
5280     if (res == NULL)
5281         goto onError;
5282     if (size == 0)
5283         return res;
5284
5285     while (inpos<size) {
5286         /* try to encode it */
5287         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5288         if (x==enc_EXCEPTION) /* error */
5289             goto onError;
5290         if (x==enc_FAILED) { /* unencodable character */
5291             if (charmap_encoding_error(p, size, &inpos, mapping,
5292                                        &exc,
5293                                        &known_errorHandler, &errorHandler, errors,
5294                                        &res, &respos)) {
5295                 goto onError;
5296             }
5297         }
5298         else
5299             /* done with this character => adjust input position */
5300             ++inpos;
5301     }
5302
5303     /* Resize if we allocated to much */
5304     if (respos<PyBytes_GET_SIZE(res))
5305         if (_PyBytes_Resize(&res, respos) < 0)
5306             goto onError;
5307
5308     Py_XDECREF(exc);
5309     Py_XDECREF(errorHandler);
5310     return res;
5311
5312   onError:
5313     Py_XDECREF(res);
5314     Py_XDECREF(exc);
5315     Py_XDECREF(errorHandler);
5316     return NULL;
5317 }
5318
5319 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5320                                     PyObject *mapping)
5321 {
5322     if (!PyUnicode_Check(unicode) || mapping == NULL) {
5323         PyErr_BadArgument();
5324         return NULL;
5325     }
5326     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5327                                    PyUnicode_GET_SIZE(unicode),
5328                                    mapping,
5329                                    NULL);
5330 }
5331
5332 /* create or adjust a UnicodeTranslateError */
5333 static void make_translate_exception(PyObject **exceptionObject,
5334                                      const Py_UNICODE *unicode, Py_ssize_t size,
5335                                      Py_ssize_t startpos, Py_ssize_t endpos,
5336                                      const char *reason)
5337 {
5338     if (*exceptionObject == NULL) {
5339         *exceptionObject = PyUnicodeTranslateError_Create(
5340             unicode, size, startpos, endpos, reason);
5341     }
5342     else {
5343         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5344             goto onError;
5345         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5346             goto onError;
5347         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5348             goto onError;
5349         return;
5350       onError:
5351         Py_DECREF(*exceptionObject);
5352         *exceptionObject = NULL;
5353     }
5354 }
5355
5356 /* raises a UnicodeTranslateError */
5357 static void raise_translate_exception(PyObject **exceptionObject,
5358                                       const Py_UNICODE *unicode, Py_ssize_t size,
5359                                       Py_ssize_t startpos, Py_ssize_t endpos,
5360                                       const char *reason)
5361 {
5362     make_translate_exception(exceptionObject,
5363                              unicode, size, startpos, endpos, reason);
5364     if (*exceptionObject != NULL)
5365         PyCodec_StrictErrors(*exceptionObject);
5366 }
5367
5368 /* error handling callback helper:
5369    build arguments, call the callback and check the arguments,
5370    put the result into newpos and return the replacement string, which
5371    has to be freed by the caller */
5372 static PyObject *unicode_translate_call_errorhandler(const char *errors,
5373                                                      PyObject **errorHandler,
5374                                                      const char *reason,
5375                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5376                                                      Py_ssize_t startpos, Py_ssize_t endpos,
5377                                                      Py_ssize_t *newpos)
5378 {
5379     static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5380
5381     Py_ssize_t i_newpos;
5382     PyObject *restuple;
5383     PyObject *resunicode;
5384
5385     if (*errorHandler == NULL) {
5386         *errorHandler = PyCodec_LookupError(errors);
5387         if (*errorHandler == NULL)
5388             return NULL;
5389     }
5390
5391     make_translate_exception(exceptionObject,
5392                              unicode, size, startpos, endpos, reason);
5393     if (*exceptionObject == NULL)
5394         return NULL;
5395
5396     restuple = PyObject_CallFunctionObjArgs(
5397         *errorHandler, *exceptionObject, NULL);
5398     if (restuple == NULL)
5399         return NULL;
5400     if (!PyTuple_Check(restuple)) {
5401         PyErr_SetString(PyExc_TypeError, &argparse[4]);
5402         Py_DECREF(restuple);
5403         return NULL;
5404     }
5405     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5406                           &resunicode, &i_newpos)) {
5407         Py_DECREF(restuple);
5408         return NULL;
5409     }
5410     if (i_newpos<0)
5411         *newpos = size+i_newpos;
5412     else
5413         *newpos = i_newpos;
5414     if (*newpos<0 || *newpos>size) {
5415         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5416         Py_DECREF(restuple);
5417         return NULL;
5418     }
5419     Py_INCREF(resunicode);
5420     Py_DECREF(restuple);
5421     return resunicode;
5422 }
5423
5424 /* Lookup the character ch in the mapping and put the result in result,
5425    which must be decrefed by the caller.
5426    Return 0 on success, -1 on error */
5427 static
5428 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5429 {
5430     PyObject *w = PyLong_FromLong((long)c);
5431     PyObject *x;
5432
5433     if (w == NULL)
5434         return -1;
5435     x = PyObject_GetItem(mapping, w);
5436     Py_DECREF(w);
5437     if (x == NULL) {
5438         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5439             /* No mapping found means: use 1:1 mapping. */
5440             PyErr_Clear();
5441             *result = NULL;
5442             return 0;
5443         } else
5444             return -1;
5445     }
5446     else if (x == Py_None) {
5447         *result = x;
5448         return 0;
5449     }
5450     else if (PyLong_Check(x)) {
5451         long value = PyLong_AS_LONG(x);
5452         long max = PyUnicode_GetMax();
5453         if (value < 0 || value > max) {
5454             PyErr_Format(PyExc_TypeError,
5455                          "character mapping must be in range(0x%x)", max+1);
5456             Py_DECREF(x);
5457             return -1;
5458         }
5459         *result = x;
5460         return 0;
5461     }
5462     else if (PyUnicode_Check(x)) {
5463         *result = x;
5464         return 0;
5465     }
5466     else {
5467         /* wrong return value */
5468         PyErr_SetString(PyExc_TypeError,
5469                         "character mapping must return integer, None or str");
5470         Py_DECREF(x);
5471         return -1;
5472     }
5473 }
5474 /* ensure that *outobj is at least requiredsize characters long,
5475    if not reallocate and adjust various state variables.
5476    Return 0 on success, -1 on error */
5477 static
5478 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5479                                Py_ssize_t requiredsize)
5480 {
5481     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5482     if (requiredsize > oldsize) {
5483         /* remember old output position */
5484         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5485         /* exponentially overallocate to minimize reallocations */
5486         if (requiredsize < 2 * oldsize)
5487             requiredsize = 2 * oldsize;
5488         if (PyUnicode_Resize(outobj, requiredsize) < 0)
5489             return -1;
5490         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5491     }
5492     return 0;
5493 }
5494 /* lookup the character, put the result in the output string and adjust
5495    various state variables. Return a new reference to the object that
5496    was put in the output buffer in *result, or Py_None, if the mapping was
5497    undefined (in which case no character was written).
5498    The called must decref result.
5499    Return 0 on success, -1 on error. */
5500 static
5501 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5502                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5503                             PyObject **res)
5504 {
5505     if (charmaptranslate_lookup(*curinp, mapping, res))
5506         return -1;
5507     if (*res==NULL) {
5508         /* not found => default to 1:1 mapping */
5509         *(*outp)++ = *curinp;
5510     }
5511     else if (*res==Py_None)
5512         ;
5513     else if (PyLong_Check(*res)) {
5514         /* no overflow check, because we know that the space is enough */
5515         *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5516     }
5517     else if (PyUnicode_Check(*res)) {
5518         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5519         if (repsize==1) {
5520             /* no overflow check, because we know that the space is enough */
5521             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5522         }
5523         else if (repsize!=0) {
5524             /* more than one character */
5525             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5526                 (insize - (curinp-startinp)) +
5527                 repsize - 1;
5528             if (charmaptranslate_makespace(outobj, outp, requiredsize))
5529                 return -1;
5530             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5531             *outp += repsize;
5532         }
5533     }
5534     else
5535         return -1;
5536     return 0;
5537 }
5538
5539 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5540                                      Py_ssize_t size,
5541                                      PyObject *mapping,
5542                                      const char *errors)
5543 {
5544     /* output object */
5545     PyObject *res = NULL;
5546     /* pointers to the beginning and end+1 of input */
5547     const Py_UNICODE *startp = p;
5548     const Py_UNICODE *endp = p + size;
5549     /* pointer into the output */
5550     Py_UNICODE *str;
5551     /* current output position */
5552     Py_ssize_t respos = 0;
5553     char *reason = "character maps to <undefined>";
5554     PyObject *errorHandler = NULL;
5555     PyObject *exc = NULL;
5556     /* the following variable is used for caching string comparisons
5557      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5558      * 3=ignore, 4=xmlcharrefreplace */
5559     int known_errorHandler = -1;
5560
5561     if (mapping == NULL) {
5562         PyErr_BadArgument();
5563         return NULL;
5564     }
5565
5566     /* allocate enough for a simple 1:1 translation without
5567        replacements, if we need more, we'll resize */
5568     res = PyUnicode_FromUnicode(NULL, size);
5569     if (res == NULL)
5570         goto onError;
5571     if (size == 0)
5572         return res;
5573     str = PyUnicode_AS_UNICODE(res);
5574
5575     while (p<endp) {
5576         /* try to encode it */
5577         PyObject *x = NULL;
5578         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5579             Py_XDECREF(x);
5580             goto onError;
5581         }
5582         Py_XDECREF(x);
5583         if (x!=Py_None) /* it worked => adjust input pointer */
5584             ++p;
5585         else { /* untranslatable character */
5586             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5587             Py_ssize_t repsize;
5588             Py_ssize_t newpos;
5589             Py_UNICODE *uni2;
5590             /* startpos for collecting untranslatable chars */
5591             const Py_UNICODE *collstart = p;
5592             const Py_UNICODE *collend = p+1;
5593             const Py_UNICODE *coll;
5594
5595             /* find all untranslatable characters */
5596             while (collend < endp) {
5597                 if (charmaptranslate_lookup(*collend, mapping, &x))
5598                     goto onError;
5599                 Py_XDECREF(x);
5600                 if (x!=Py_None)
5601                     break;
5602                 ++collend;
5603             }
5604             /* cache callback name lookup
5605              * (if not done yet, i.e. it's the first error) */
5606             if (known_errorHandler==-1) {
5607                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5608                     known_errorHandler = 1;
5609                 else if (!strcmp(errors, "replace"))
5610                     known_errorHandler = 2;
5611                 else if (!strcmp(errors, "ignore"))
5612                     known_errorHandler = 3;
5613                 else if (!strcmp(errors, "xmlcharrefreplace"))
5614                     known_errorHandler = 4;
5615                 else
5616                     known_errorHandler = 0;
5617             }
5618             switch (known_errorHandler) {
5619             case 1: /* strict */
5620                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5621                 goto onError;
5622             case 2: /* replace */
5623                 /* No need to check for space, this is a 1:1 replacement */
5624                 for (coll = collstart; coll<collend; ++coll)
5625                     *str++ = '?';
5626                 /* fall through */
5627             case 3: /* ignore */
5628                 p = collend;
5629                 break;
5630             case 4: /* xmlcharrefreplace */
5631                 /* generate replacement (temporarily (mis)uses p) */
5632                 for (p = collstart; p < collend; ++p) {
5633                     char buffer[2+29+1+1];
5634                     char *cp;
5635                     sprintf(buffer, "&#%d;", (int)*p);
5636                     if (charmaptranslate_makespace(&res, &str,
5637                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5638                         goto onError;
5639                     for (cp = buffer; *cp; ++cp)
5640                         *str++ = *cp;
5641                 }
5642                 p = collend;
5643                 break;
5644             default:
5645                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5646                                                                  reason, startp, size, &exc,
5647                                                                  collstart-startp, collend-startp, &newpos);
5648                 if (repunicode == NULL)
5649                     goto onError;
5650                 /* generate replacement  */
5651                 repsize = PyUnicode_GET_SIZE(repunicode);
5652                 if (charmaptranslate_makespace(&res, &str,
5653                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5654                     Py_DECREF(repunicode);
5655                     goto onError;
5656                 }
5657                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5658                     *str++ = *uni2;
5659                 p = startp + newpos;
5660                 Py_DECREF(repunicode);
5661             }
5662         }
5663     }
5664     /* Resize if we allocated to much */
5665     respos = str-PyUnicode_AS_UNICODE(res);
5666     if (respos<PyUnicode_GET_SIZE(res)) {
5667         if (PyUnicode_Resize(&res, respos) < 0)
5668             goto onError;
5669     }
5670     Py_XDECREF(exc);
5671     Py_XDECREF(errorHandler);
5672     return res;
5673
5674   onError:
5675     Py_XDECREF(res);
5676     Py_XDECREF(exc);
5677     Py_XDECREF(errorHandler);
5678     return NULL;
5679 }
5680
5681 PyObject *PyUnicode_Translate(PyObject *str,
5682                               PyObject *mapping,
5683                               const char *errors)
5684 {
5685     PyObject *result;
5686
5687     str = PyUnicode_FromObject(str);
5688     if (str == NULL)
5689         goto onError;
5690     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5691                                         PyUnicode_GET_SIZE(str),
5692                                         mapping,
5693                                         errors);
5694     Py_DECREF(str);
5695     return result;
5696
5697   onError:
5698     Py_XDECREF(str);
5699     return NULL;
5700 }
5701
5702 /* --- Decimal Encoder ---------------------------------------------------- */
5703
5704 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5705                             Py_ssize_t length,
5706                             char *output,
5707                             const char *errors)
5708 {
5709     Py_UNICODE *p, *end;
5710     PyObject *errorHandler = NULL;
5711     PyObject *exc = NULL;
5712     const char *encoding = "decimal";
5713     const char *reason = "invalid decimal Unicode string";
5714     /* the following variable is used for caching string comparisons
5715      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5716     int known_errorHandler = -1;
5717
5718     if (output == NULL) {
5719         PyErr_BadArgument();
5720         return -1;
5721     }
5722
5723     p = s;
5724     end = s + length;
5725     while (p < end) {
5726         register Py_UNICODE ch = *p;
5727         int decimal;
5728         PyObject *repunicode;
5729         Py_ssize_t repsize;
5730         Py_ssize_t newpos;
5731         Py_UNICODE *uni2;
5732         Py_UNICODE *collstart;
5733         Py_UNICODE *collend;
5734
5735         if (Py_UNICODE_ISSPACE(ch)) {
5736             *output++ = ' ';
5737             ++p;
5738             continue;
5739         }
5740         decimal = Py_UNICODE_TODECIMAL(ch);
5741         if (decimal >= 0) {
5742             *output++ = '0' + decimal;
5743             ++p;
5744             continue;
5745         }
5746         if (0 < ch && ch < 256) {
5747             *output++ = (char)ch;
5748             ++p;
5749             continue;
5750         }
5751         /* All other characters are considered unencodable */
5752         collstart = p;
5753         collend = p+1;
5754         while (collend < end) {
5755             if ((0 < *collend && *collend < 256) ||
5756                 !Py_UNICODE_ISSPACE(*collend) ||
5757                 Py_UNICODE_TODECIMAL(*collend))
5758                 break;
5759         }
5760         /* cache callback name lookup
5761          * (if not done yet, i.e. it's the first error) */
5762         if (known_errorHandler==-1) {
5763             if ((errors==NULL) || (!strcmp(errors, "strict")))
5764                 known_errorHandler = 1;
5765             else if (!strcmp(errors, "replace"))
5766                 known_errorHandler = 2;
5767             else if (!strcmp(errors, "ignore"))
5768                 known_errorHandler = 3;
5769             else if (!strcmp(errors, "xmlcharrefreplace"))
5770                 known_errorHandler = 4;
5771             else
5772                 known_errorHandler = 0;
5773         }
5774         switch (known_errorHandler) {
5775         case 1: /* strict */
5776             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5777             goto onError;
5778         case 2: /* replace */
5779             for (p = collstart; p < collend; ++p)
5780                 *output++ = '?';
5781             /* fall through */
5782         case 3: /* ignore */
5783             p = collend;
5784             break;
5785         case 4: /* xmlcharrefreplace */
5786             /* generate replacement (temporarily (mis)uses p) */
5787             for (p = collstart; p < collend; ++p)
5788                 output += sprintf(output, "&#%d;", (int)*p);
5789             p = collend;
5790             break;
5791         default:
5792             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5793                                                           encoding, reason, s, length, &exc,
5794                                                           collstart-s, collend-s, &newpos);
5795             if (repunicode == NULL)
5796                 goto onError;
5797             if (!PyUnicode_Check(repunicode)) {
5798                 /* Byte results not supported, since they have no decimal property. */
5799                 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5800                 Py_DECREF(repunicode);
5801                 goto onError;
5802             }
5803             /* generate replacement  */
5804             repsize = PyUnicode_GET_SIZE(repunicode);
5805             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5806                 Py_UNICODE ch = *uni2;
5807                 if (Py_UNICODE_ISSPACE(ch))
5808                     *output++ = ' ';
5809                 else {
5810                     decimal = Py_UNICODE_TODECIMAL(ch);
5811                     if (decimal >= 0)
5812                         *output++ = '0' + decimal;
5813                     else if (0 < ch && ch < 256)
5814                         *output++ = (char)ch;
5815                     else {
5816                         Py_DECREF(repunicode);
5817                         raise_encode_exception(&exc, encoding,
5818                                                s, length, collstart-s, collend-s, reason);
5819                         goto onError;
5820                     }
5821                 }
5822             }
5823             p = s + newpos;
5824             Py_DECREF(repunicode);
5825         }
5826     }
5827     /* 0-terminate the output string */
5828     *output++ = '\0';
5829     Py_XDECREF(exc);
5830     Py_XDECREF(errorHandler);
5831     return 0;
5832
5833   onError:
5834     Py_XDECREF(exc);
5835     Py_XDECREF(errorHandler);
5836     return -1;
5837 }
5838
5839 /* --- Helpers ------------------------------------------------------------ */
5840
5841 #include "stringlib/unicodedefs.h"
5842 #include "stringlib/fastsearch.h"
5843 #include "stringlib/count.h"
5844 /* Include _ParseTupleFinds from find.h */
5845 #define FROM_UNICODE
5846 #include "stringlib/find.h"
5847 #include "stringlib/partition.h"
5848
5849 #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5850 #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
5851 #include "stringlib/localeutil.h"
5852
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope and
   on (obj)->length: a negative start is taken relative to the length and
   then clamped to 0; an end beyond the length is clamped to the length;
   a negative end is taken relative to the length and then clamped to 0.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and is safe under an unbraced if/else.  Use it like a
   function call, with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5865
5866 Py_ssize_t PyUnicode_Count(PyObject *str,
5867                            PyObject *substr,
5868                            Py_ssize_t start,
5869                            Py_ssize_t end)
5870 {
5871     Py_ssize_t result;
5872     PyUnicodeObject* str_obj;
5873     PyUnicodeObject* sub_obj;
5874
5875     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5876     if (!str_obj)
5877         return -1;
5878     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5879     if (!sub_obj) {
5880         Py_DECREF(str_obj);
5881         return -1;
5882     }
5883
5884     FIX_START_END(str_obj);
5885
5886     result = stringlib_count(
5887         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5888         );
5889
5890     Py_DECREF(sub_obj);
5891     Py_DECREF(str_obj);
5892
5893     return result;
5894 }
5895
5896 Py_ssize_t PyUnicode_Find(PyObject *str,
5897                           PyObject *sub,
5898                           Py_ssize_t start,
5899                           Py_ssize_t end,
5900                           int direction)
5901 {
5902     Py_ssize_t result;
5903
5904     str = PyUnicode_FromObject(str);
5905     if (!str)
5906         return -2;
5907     sub = PyUnicode_FromObject(sub);
5908     if (!sub) {
5909         Py_DECREF(str);
5910         return -2;
5911     }
5912
5913     if (direction > 0)
5914         result = stringlib_find_slice(
5915             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5916             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5917             start, end
5918             );
5919     else
5920         result = stringlib_rfind_slice(
5921             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5922             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5923             start, end
5924             );
5925
5926     Py_DECREF(str);
5927     Py_DECREF(sub);
5928
5929     return result;
5930 }
5931
5932 static
5933 int tailmatch(PyUnicodeObject *self,
5934               PyUnicodeObject *substring,
5935               Py_ssize_t start,
5936               Py_ssize_t end,
5937               int direction)
5938 {
5939     if (substring->length == 0)
5940         return 1;
5941
5942     FIX_START_END(self);
5943
5944     end -= substring->length;
5945     if (end < start)
5946         return 0;
5947
5948     if (direction > 0) {
5949         if (Py_UNICODE_MATCH(self, end, substring))
5950             return 1;
5951     } else {
5952         if (Py_UNICODE_MATCH(self, start, substring))
5953             return 1;
5954     }
5955
5956     return 0;
5957 }
5958
5959 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5960                                PyObject *substr,
5961                                Py_ssize_t start,
5962                                Py_ssize_t end,
5963                                int direction)
5964 {
5965     Py_ssize_t result;
5966
5967     str = PyUnicode_FromObject(str);
5968     if (str == NULL)
5969         return -1;
5970     substr = PyUnicode_FromObject(substr);
5971     if (substr == NULL) {
5972         Py_DECREF(str);
5973         return -1;
5974     }
5975
5976     result = tailmatch((PyUnicodeObject *)str,
5977                        (PyUnicodeObject *)substr,
5978                        start, end, direction);
5979     Py_DECREF(str);
5980     Py_DECREF(substr);
5981     return result;
5982 }
5983
5984 /* Apply fixfct filter to the Unicode object self and return a
5985    reference to the modified object */
5986
5987 static
5988 PyObject *fixup(PyUnicodeObject *self,
5989                 int (*fixfct)(PyUnicodeObject *s))
5990 {
5991
5992     PyUnicodeObject *u;
5993
5994     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5995     if (u == NULL)
5996         return NULL;
5997
5998     Py_UNICODE_COPY(u->str, self->str, self->length);
5999
6000     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6001         /* fixfct should return TRUE if it modified the buffer. If
6002            FALSE, return a reference to the original buffer instead
6003            (to save space, not time) */
6004         Py_INCREF(self);
6005         Py_DECREF(u);
6006         return (PyObject*) self;
6007     }
6008     return (PyObject*) u;
6009 }
6010
6011 static
6012 int fixupper(PyUnicodeObject *self)
6013 {
6014     Py_ssize_t len = self->length;
6015     Py_UNICODE *s = self->str;
6016     int status = 0;
6017
6018     while (len-- > 0) {
6019         register Py_UNICODE ch;
6020
6021         ch = Py_UNICODE_TOUPPER(*s);
6022         if (ch != *s) {
6023             status = 1;
6024             *s = ch;
6025         }
6026         s++;
6027     }
6028
6029     return status;
6030 }
6031
6032 static
6033 int fixlower(PyUnicodeObject *self)
6034 {
6035     Py_ssize_t len = self->length;
6036     Py_UNICODE *s = self->str;
6037     int status = 0;
6038
6039     while (len-- > 0) {
6040         register Py_UNICODE ch;
6041
6042         ch = Py_UNICODE_TOLOWER(*s);
6043         if (ch != *s) {
6044             status = 1;
6045             *s = ch;
6046         }
6047         s++;
6048     }
6049
6050     return status;
6051 }
6052
6053 static
6054 int fixswapcase(PyUnicodeObject *self)
6055 {
6056     Py_ssize_t len = self->length;
6057     Py_UNICODE *s = self->str;
6058     int status = 0;
6059
6060     while (len-- > 0) {
6061         if (Py_UNICODE_ISUPPER(*s)) {
6062             *s = Py_UNICODE_TOLOWER(*s);
6063             status = 1;
6064         } else if (Py_UNICODE_ISLOWER(*s)) {
6065             *s = Py_UNICODE_TOUPPER(*s);
6066             status = 1;
6067         }
6068         s++;
6069     }
6070
6071     return status;
6072 }
6073
6074 static
6075 int fixcapitalize(PyUnicodeObject *self)
6076 {
6077     Py_ssize_t len = self->length;
6078     Py_UNICODE *s = self->str;
6079     int status = 0;
6080
6081     if (len == 0)
6082         return 0;
6083     if (Py_UNICODE_ISLOWER(*s)) {
6084         *s = Py_UNICODE_TOUPPER(*s);
6085         status = 1;
6086     }
6087     s++;
6088     while (--len > 0) {
6089         if (Py_UNICODE_ISUPPER(*s)) {
6090             *s = Py_UNICODE_TOLOWER(*s);
6091             status = 1;
6092         }
6093         s++;
6094     }
6095     return status;
6096 }
6097
6098 static
6099 int fixtitle(PyUnicodeObject *self)
6100 {
6101     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6102     register Py_UNICODE *e;
6103     int previous_is_cased;
6104
6105     /* Shortcut for single character strings */
6106     if (PyUnicode_GET_SIZE(self) == 1) {
6107         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6108         if (*p != ch) {
6109             *p = ch;
6110             return 1;
6111         }
6112         else
6113             return 0;
6114     }
6115
6116     e = p + PyUnicode_GET_SIZE(self);
6117     previous_is_cased = 0;
6118     for (; p < e; p++) {
6119         register const Py_UNICODE ch = *p;
6120
6121         if (previous_is_cased)
6122             *p = Py_UNICODE_TOLOWER(ch);
6123         else
6124             *p = Py_UNICODE_TOTITLE(ch);
6125
6126         if (Py_UNICODE_ISLOWER(ch) ||
6127             Py_UNICODE_ISUPPER(ch) ||
6128             Py_UNICODE_ISTITLE(ch))
6129             previous_is_cased = 1;
6130         else
6131             previous_is_cased = 0;
6132     }
6133     return 1;
6134 }
6135
6136 PyObject *
6137 PyUnicode_Join(PyObject *separator, PyObject *seq)
6138 {
6139     const Py_UNICODE blank = ' ';
6140     const Py_UNICODE *sep = &blank;
6141     Py_ssize_t seplen = 1;
6142     PyUnicodeObject *res = NULL; /* the result */
6143     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6144     PyObject *fseq;          /* PySequence_Fast(seq) */
6145     Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6146     PyObject **items;
6147     PyObject *item;
6148     Py_ssize_t sz, i;
6149
6150     fseq = PySequence_Fast(seq, "");
6151     if (fseq == NULL) {
6152         return NULL;
6153     }
6154
6155     /* NOTE: the following code can't call back into Python code,
6156      * so we are sure that fseq won't be mutated.
6157      */
6158
6159     seqlen = PySequence_Fast_GET_SIZE(fseq);
6160     /* If empty sequence, return u"". */
6161     if (seqlen == 0) {
6162         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6163         goto Done;
6164     }
6165     items = PySequence_Fast_ITEMS(fseq);
6166     /* If singleton sequence with an exact Unicode, return that. */
6167     if (seqlen == 1) {
6168         item = items[0];
6169         if (PyUnicode_CheckExact(item)) {
6170             Py_INCREF(item);
6171             res = (PyUnicodeObject *)item;
6172             goto Done;
6173         }
6174     }
6175     else {
6176         /* Set up sep and seplen */
6177         if (separator == NULL) {
6178             sep = &blank;
6179             seplen = 1;
6180         }
6181         else {
6182             if (!PyUnicode_Check(separator)) {
6183                 PyErr_Format(PyExc_TypeError,
6184                              "separator: expected str instance,"
6185                              " %.80s found",
6186                              Py_TYPE(separator)->tp_name);
6187                 goto onError;
6188             }
6189             sep = PyUnicode_AS_UNICODE(separator);
6190             seplen = PyUnicode_GET_SIZE(separator);
6191         }
6192     }
6193
6194     /* There are at least two things to join, or else we have a subclass
6195      * of str in the sequence.
6196      * Do a pre-pass to figure out the total amount of space we'll
6197      * need (sz), and see whether all argument are strings.
6198      */
6199     sz = 0;
6200     for (i = 0; i < seqlen; i++) {
6201         const Py_ssize_t old_sz = sz;
6202         item = items[i];
6203         if (!PyUnicode_Check(item)) {
6204             PyErr_Format(PyExc_TypeError,
6205                          "sequence item %zd: expected str instance,"
6206                          " %.80s found",
6207                          i, Py_TYPE(item)->tp_name);
6208             goto onError;
6209         }
6210         sz += PyUnicode_GET_SIZE(item);
6211         if (i != 0)
6212             sz += seplen;
6213         if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6214             PyErr_SetString(PyExc_OverflowError,
6215                             "join() result is too long for a Python string");
6216             goto onError;
6217         }
6218     }
6219
6220     res = _PyUnicode_New(sz);
6221     if (res == NULL)
6222         goto onError;
6223
6224     /* Catenate everything. */
6225     res_p = PyUnicode_AS_UNICODE(res);
6226     for (i = 0; i < seqlen; ++i) {
6227         Py_ssize_t itemlen;
6228         item = items[i];
6229         itemlen = PyUnicode_GET_SIZE(item);
6230         /* Copy item, and maybe the separator. */
6231         if (i) {
6232             Py_UNICODE_COPY(res_p, sep, seplen);
6233             res_p += seplen;
6234         }
6235         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6236         res_p += itemlen;
6237     }
6238
6239   Done:
6240     Py_DECREF(fseq);
6241     return (PyObject *)res;
6242
6243   onError:
6244     Py_DECREF(fseq);
6245     Py_XDECREF(res);
6246     return NULL;
6247 }
6248
6249 static
6250 PyUnicodeObject *pad(PyUnicodeObject *self,
6251                      Py_ssize_t left,
6252                      Py_ssize_t right,
6253                      Py_UNICODE fill)
6254 {
6255     PyUnicodeObject *u;
6256
6257     if (left < 0)
6258         left = 0;
6259     if (right < 0)
6260         right = 0;
6261
6262     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6263         Py_INCREF(self);
6264         return self;
6265     }
6266
6267     if (left > PY_SSIZE_T_MAX - self->length ||
6268         right > PY_SSIZE_T_MAX - (left + self->length)) {
6269         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6270         return NULL;
6271     }
6272     u = _PyUnicode_New(left + self->length + right);
6273     if (u) {
6274         if (left)
6275             Py_UNICODE_FILL(u->str, fill, left);
6276         Py_UNICODE_COPY(u->str + left, self->str, self->length);
6277         if (right)
6278             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6279     }
6280
6281     return u;
6282 }
6283
/* Append data[left:right] as a new unicode object to `list`.
 *
 * Relies on names supplied by the expanding function: a `PyObject *str`
 * scratch variable, a `PyObject *list` target and an `onError` label that
 * is jumped to when either the string creation or the append fails.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and stays correct under an unbraced if/else.  Always follow the
 * invocation with a semicolon.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    } while (0)
6294
6295 static
6296 PyObject *split_whitespace(PyUnicodeObject *self,
6297                            PyObject *list,
6298                            Py_ssize_t maxcount)
6299 {
6300     register Py_ssize_t i;
6301     register Py_ssize_t j;
6302     Py_ssize_t len = self->length;
6303     PyObject *str;
6304     register const Py_UNICODE *buf = self->str;
6305
6306     for (i = j = 0; i < len; ) {
6307         /* find a token */
6308         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6309             i++;
6310         j = i;
6311         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6312             i++;
6313         if (j < i) {
6314             if (maxcount-- <= 0)
6315                 break;
6316             SPLIT_APPEND(buf, j, i);
6317             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6318                 i++;
6319             j = i;
6320         }
6321     }
6322     if (j < len) {
6323         SPLIT_APPEND(buf, j, len);
6324     }
6325     return list;
6326
6327   onError:
6328     Py_DECREF(list);
6329     return NULL;
6330 }
6331
6332 PyObject *PyUnicode_Splitlines(PyObject *string,
6333                                int keepends)
6334 {
6335     register Py_ssize_t i;
6336     register Py_ssize_t j;
6337     Py_ssize_t len;
6338     PyObject *list;
6339     PyObject *str;
6340     Py_UNICODE *data;
6341
6342     string = PyUnicode_FromObject(string);
6343     if (string == NULL)
6344         return NULL;
6345     data = PyUnicode_AS_UNICODE(string);
6346     len = PyUnicode_GET_SIZE(string);
6347
6348     list = PyList_New(0);
6349     if (!list)
6350         goto onError;
6351
6352     for (i = j = 0; i < len; ) {
6353         Py_ssize_t eol;
6354
6355         /* Find a line and append it */
6356         while (i < len && !BLOOM_LINEBREAK(data[i]))
6357             i++;
6358
6359         /* Skip the line break reading CRLF as one line break */
6360         eol = i;
6361         if (i < len) {
6362             if (data[i] == '\r' && i + 1 < len &&
6363                 data[i+1] == '\n')
6364                 i += 2;
6365             else
6366                 i++;
6367             if (keepends)
6368                 eol = i;
6369         }
6370         SPLIT_APPEND(data, j, eol);
6371         j = i;
6372     }
6373     if (j < len) {
6374         SPLIT_APPEND(data, j, len);
6375     }
6376
6377     Py_DECREF(string);
6378     return list;
6379
6380   onError:
6381     Py_XDECREF(list);
6382     Py_DECREF(string);
6383     return NULL;
6384 }
6385
6386 static
6387 PyObject *split_char(PyUnicodeObject *self,
6388                      PyObject *list,
6389                      Py_UNICODE ch,
6390                      Py_ssize_t maxcount)
6391 {
6392     register Py_ssize_t i;
6393     register Py_ssize_t j;
6394     Py_ssize_t len = self->length;
6395     PyObject *str;
6396     register const Py_UNICODE *buf = self->str;
6397
6398     for (i = j = 0; i < len; ) {
6399         if (buf[i] == ch) {
6400             if (maxcount-- <= 0)
6401                 break;
6402             SPLIT_APPEND(buf, j, i);
6403             i = j = i + 1;
6404         } else
6405             i++;
6406     }
6407     if (j <= len) {
6408         SPLIT_APPEND(buf, j, len);
6409     }
6410     return list;
6411
6412   onError:
6413     Py_DECREF(list);
6414     return NULL;
6415 }
6416
6417 static
6418 PyObject *split_substring(PyUnicodeObject *self,
6419                           PyObject *list,
6420                           PyUnicodeObject *substring,
6421                           Py_ssize_t maxcount)
6422 {
6423     register Py_ssize_t i;
6424     register Py_ssize_t j;
6425     Py_ssize_t len = self->length;
6426     Py_ssize_t sublen = substring->length;
6427     PyObject *str;
6428
6429     for (i = j = 0; i <= len - sublen; ) {
6430         if (Py_UNICODE_MATCH(self, i, substring)) {
6431             if (maxcount-- <= 0)
6432                 break;
6433             SPLIT_APPEND(self->str, j, i);
6434             i = j = i + sublen;
6435         } else
6436             i++;
6437     }
6438     if (j <= len) {
6439         SPLIT_APPEND(self->str, j, len);
6440     }
6441     return list;
6442
6443   onError:
6444     Py_DECREF(list);
6445     return NULL;
6446 }
6447
6448 static
6449 PyObject *rsplit_whitespace(PyUnicodeObject *self,
6450                             PyObject *list,
6451                             Py_ssize_t maxcount)
6452 {
6453     register Py_ssize_t i;
6454     register Py_ssize_t j;
6455     Py_ssize_t len = self->length;
6456     PyObject *str;
6457     register const Py_UNICODE *buf = self->str;
6458
6459     for (i = j = len - 1; i >= 0; ) {
6460         /* find a token */
6461         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6462             i--;
6463         j = i;
6464         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6465             i--;
6466         if (j > i) {
6467             if (maxcount-- <= 0)
6468                 break;
6469             SPLIT_APPEND(buf, i + 1, j + 1);
6470             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6471                 i--;
6472             j = i;
6473         }
6474     }
6475     if (j >= 0) {
6476         SPLIT_APPEND(buf, 0, j + 1);
6477     }
6478     if (PyList_Reverse(list) < 0)
6479         goto onError;
6480     return list;
6481
6482   onError:
6483     Py_DECREF(list);
6484     return NULL;
6485 }
6486
6487 static
6488 PyObject *rsplit_char(PyUnicodeObject *self,
6489                       PyObject *list,
6490                       Py_UNICODE ch,
6491                       Py_ssize_t maxcount)
6492 {
6493     register Py_ssize_t i;
6494     register Py_ssize_t j;
6495     Py_ssize_t len = self->length;
6496     PyObject *str;
6497     register const Py_UNICODE *buf = self->str;
6498
6499     for (i = j = len - 1; i >= 0; ) {
6500         if (buf[i] == ch) {
6501             if (maxcount-- <= 0)
6502                 break;
6503             SPLIT_APPEND(buf, i + 1, j + 1);
6504             j = i = i - 1;
6505         } else
6506             i--;
6507     }
6508     if (j >= -1) {
6509         SPLIT_APPEND(buf, 0, j + 1);
6510     }
6511     if (PyList_Reverse(list) < 0)
6512         goto onError;
6513     return list;
6514
6515   onError:
6516     Py_DECREF(list);
6517     return NULL;
6518 }
6519
6520 static
6521 PyObject *rsplit_substring(PyUnicodeObject *self,
6522                            PyObject *list,
6523                            PyUnicodeObject *substring,
6524                            Py_ssize_t maxcount)
6525 {
6526     register Py_ssize_t i;
6527     register Py_ssize_t j;
6528     Py_ssize_t len = self->length;
6529     Py_ssize_t sublen = substring->length;
6530     PyObject *str;
6531
6532     for (i = len - sublen, j = len; i >= 0; ) {
6533         if (Py_UNICODE_MATCH(self, i, substring)) {
6534             if (maxcount-- <= 0)
6535                 break;
6536             SPLIT_APPEND(self->str, i + sublen, j);
6537             j = i;
6538             i -= sublen;
6539         } else
6540             i--;
6541     }
6542     if (j >= 0) {
6543         SPLIT_APPEND(self->str, 0, j);
6544     }
6545     if (PyList_Reverse(list) < 0)
6546         goto onError;
6547     return list;
6548
6549   onError:
6550     Py_DECREF(list);
6551     return NULL;
6552 }
6553
6554 #undef SPLIT_APPEND
6555
6556 static
6557 PyObject *split(PyUnicodeObject *self,
6558                 PyUnicodeObject *substring,
6559                 Py_ssize_t maxcount)
6560 {
6561     PyObject *list;
6562
6563     if (maxcount < 0)
6564         maxcount = PY_SSIZE_T_MAX;
6565
6566     list = PyList_New(0);
6567     if (!list)
6568         return NULL;
6569
6570     if (substring == NULL)
6571         return split_whitespace(self,list,maxcount);
6572
6573     else if (substring->length == 1)
6574         return split_char(self,list,substring->str[0],maxcount);
6575
6576     else if (substring->length == 0) {
6577         Py_DECREF(list);
6578         PyErr_SetString(PyExc_ValueError, "empty separator");
6579         return NULL;
6580     }
6581     else
6582         return split_substring(self,list,substring,maxcount);
6583 }
6584
6585 static
6586 PyObject *rsplit(PyUnicodeObject *self,
6587                  PyUnicodeObject *substring,
6588                  Py_ssize_t maxcount)
6589 {
6590     PyObject *list;
6591
6592     if (maxcount < 0)
6593         maxcount = PY_SSIZE_T_MAX;
6594
6595     list = PyList_New(0);
6596     if (!list)
6597         return NULL;
6598
6599     if (substring == NULL)
6600         return rsplit_whitespace(self,list,maxcount);
6601
6602     else if (substring->length == 1)
6603         return rsplit_char(self,list,substring->str[0],maxcount);
6604
6605     else if (substring->length == 0) {
6606         Py_DECREF(list);
6607         PyErr_SetString(PyExc_ValueError, "empty separator");
6608         return NULL;
6609     }
6610     else
6611         return rsplit_substring(self,list,substring,maxcount);
6612 }
6613
6614 static
6615 PyObject *replace(PyUnicodeObject *self,
6616                   PyUnicodeObject *str1,
6617                   PyUnicodeObject *str2,
6618                   Py_ssize_t maxcount)
6619 {
6620     PyUnicodeObject *u;
6621
6622     if (maxcount < 0)
6623         maxcount = PY_SSIZE_T_MAX;
6624
6625     if (str1->length == str2->length) {
6626         /* same length */
6627         Py_ssize_t i;
6628         if (str1->length == 1) {
6629             /* replace characters */
6630             Py_UNICODE u1, u2;
6631             if (!findchar(self->str, self->length, str1->str[0]))
6632                 goto nothing;
6633             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6634             if (!u)
6635                 return NULL;
6636             Py_UNICODE_COPY(u->str, self->str, self->length);
6637             u1 = str1->str[0];
6638             u2 = str2->str[0];
6639             for (i = 0; i < u->length; i++)
6640                 if (u->str[i] == u1) {
6641                     if (--maxcount < 0)
6642                         break;
6643                     u->str[i] = u2;
6644                 }
6645         } else {
6646             i = fastsearch(
6647                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
6648                 );
6649             if (i < 0)
6650                 goto nothing;
6651             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6652             if (!u)
6653                 return NULL;
6654             Py_UNICODE_COPY(u->str, self->str, self->length);
6655             while (i <= self->length - str1->length)
6656                 if (Py_UNICODE_MATCH(self, i, str1)) {
6657                     if (--maxcount < 0)
6658                         break;
6659                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6660                     i += str1->length;
6661                 } else
6662                     i++;
6663         }
6664     } else {
6665
6666         Py_ssize_t n, i, j, e;
6667         Py_ssize_t product, new_size, delta;
6668         Py_UNICODE *p;
6669
6670         /* replace strings */
6671         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6672         if (n > maxcount)
6673             n = maxcount;
6674         if (n == 0)
6675             goto nothing;
6676         /* new_size = self->length + n * (str2->length - str1->length)); */
6677         delta = (str2->length - str1->length);
6678         if (delta == 0) {
6679             new_size = self->length;
6680         } else {
6681             product = n * (str2->length - str1->length);
6682             if ((product / (str2->length - str1->length)) != n) {
6683                 PyErr_SetString(PyExc_OverflowError,
6684                                 "replace string is too long");
6685                 return NULL;
6686             }
6687             new_size = self->length + product;
6688             if (new_size < 0) {
6689                 PyErr_SetString(PyExc_OverflowError,
6690                                 "replace string is too long");
6691                 return NULL;
6692             }
6693         }
6694         u = _PyUnicode_New(new_size);
6695         if (!u)
6696             return NULL;
6697         i = 0;
6698         p = u->str;
6699         e = self->length - str1->length;
6700         if (str1->length > 0) {
6701             while (n-- > 0) {
6702                 /* look for next match */
6703                 j = i;
6704                 while (j <= e) {
6705                     if (Py_UNICODE_MATCH(self, j, str1))
6706                         break;
6707                     j++;
6708                 }
6709                 if (j > i) {
6710                     if (j > e)
6711                         break;
6712                     /* copy unchanged part [i:j] */
6713                     Py_UNICODE_COPY(p, self->str+i, j-i);
6714                     p += j - i;
6715                 }
6716                 /* copy substitution string */
6717                 if (str2->length > 0) {
6718                     Py_UNICODE_COPY(p, str2->str, str2->length);
6719                     p += str2->length;
6720                 }
6721                 i = j + str1->length;
6722             }
6723             if (i < self->length)
6724                 /* copy tail [i:] */
6725                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6726         } else {
6727             /* interleave */
6728             while (n > 0) {
6729                 Py_UNICODE_COPY(p, str2->str, str2->length);
6730                 p += str2->length;
6731                 if (--n <= 0)
6732                     break;
6733                 *p++ = self->str[i++];
6734             }
6735             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6736         }
6737     }
6738     return (PyObject *) u;
6739
6740   nothing:
6741     /* nothing to replace; return original string (when possible) */
6742     if (PyUnicode_CheckExact(self)) {
6743         Py_INCREF(self);
6744         return (PyObject *) self;
6745     }
6746     return PyUnicode_FromUnicode(self->str, self->length);
6747 }
6748
6749 /* --- Unicode Object Methods --------------------------------------------- */
6750
6751 PyDoc_STRVAR(title__doc__,
6752              "S.title() -> str\n\
6753 \n\
6754 Return a titlecased version of S, i.e. words start with title case\n\
6755 characters, all remaining cased characters have lower case.");
6756
6757 static PyObject*
6758 unicode_title(PyUnicodeObject *self)
6759 {
6760     return fixup(self, fixtitle);
6761 }
6762
6763 PyDoc_STRVAR(capitalize__doc__,
6764              "S.capitalize() -> str\n\
6765 \n\
6766 Return a capitalized version of S, i.e. make the first character\n\
6767 have upper case and the rest lower case.");
6768
6769 static PyObject*
6770 unicode_capitalize(PyUnicodeObject *self)
6771 {
6772     return fixup(self, fixcapitalize);
6773 }
6774
6775 #if 0
6776 PyDoc_STRVAR(capwords__doc__,
6777              "S.capwords() -> str\n\
6778 \n\
6779 Apply .capitalize() to all words in S and return the result with\n\
6780 normalized whitespace (all whitespace strings are replaced by ' ').");
6781
6782 static PyObject*
6783 unicode_capwords(PyUnicodeObject *self)
6784 {
6785     PyObject *list;
6786     PyObject *item;
6787     Py_ssize_t i;
6788
6789     /* Split into words */
6790     list = split(self, NULL, -1);
6791     if (!list)
6792         return NULL;
6793
6794     /* Capitalize each word */
6795     for (i = 0; i < PyList_GET_SIZE(list); i++) {
6796         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6797                      fixcapitalize);
6798         if (item == NULL)
6799             goto onError;
6800         Py_DECREF(PyList_GET_ITEM(list, i));
6801         PyList_SET_ITEM(list, i, item);
6802     }
6803
6804     /* Join the words to form a new string */
6805     item = PyUnicode_Join(NULL, list);
6806
6807   onError:
6808     Py_DECREF(list);
6809     return (PyObject *)item;
6810 }
6811 #endif
6812
6813 /* Argument converter.  Coerces to a single unicode character */
6814
6815 static int
6816 convert_uc(PyObject *obj, void *addr)
6817 {
6818     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6819     PyObject *uniobj;
6820     Py_UNICODE *unistr;
6821
6822     uniobj = PyUnicode_FromObject(obj);
6823     if (uniobj == NULL) {
6824         PyErr_SetString(PyExc_TypeError,
6825                         "The fill character cannot be converted to Unicode");
6826         return 0;
6827     }
6828     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6829         PyErr_SetString(PyExc_TypeError,
6830                         "The fill character must be exactly one character long");
6831         Py_DECREF(uniobj);
6832         return 0;
6833     }
6834     unistr = PyUnicode_AS_UNICODE(uniobj);
6835     *fillcharloc = unistr[0];
6836     Py_DECREF(uniobj);
6837     return 1;
6838 }
6839
6840 PyDoc_STRVAR(center__doc__,
6841              "S.center(width[, fillchar]) -> str\n\
6842 \n\
6843 Return S centered in a string of length width. Padding is\n\
6844 done using the specified fill character (default is a space)");
6845
6846 static PyObject *
6847 unicode_center(PyUnicodeObject *self, PyObject *args)
6848 {
6849     Py_ssize_t marg, left;
6850     Py_ssize_t width;
6851     Py_UNICODE fillchar = ' ';
6852
6853     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6854         return NULL;
6855
6856     if (self->length >= width && PyUnicode_CheckExact(self)) {
6857         Py_INCREF(self);
6858         return (PyObject*) self;
6859     }
6860
6861     marg = width - self->length;
6862     left = marg / 2 + (marg & width & 1);
6863
6864     return (PyObject*) pad(self, left, marg - left, fillchar);
6865 }
6866
6867 #if 0
6868
6869 /* This code should go into some future Unicode collation support
6870    module. The basic comparison should compare ordinals on a naive
6871    basis (this is what Java does and thus Jython too). */
6872
6873 /* speedy UTF-16 code point order comparison */
6874 /* gleaned from: */
6875 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6876
6877 static short utf16Fixup[32] =
6878 {
6879     0, 0, 0, 0, 0, 0, 0, 0,
6880     0, 0, 0, 0, 0, 0, 0, 0,
6881     0, 0, 0, 0, 0, 0, 0, 0,
6882     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6883 };
6884
6885 static int
6886 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6887 {
6888     Py_ssize_t len1, len2;
6889
6890     Py_UNICODE *s1 = str1->str;
6891     Py_UNICODE *s2 = str2->str;
6892
6893     len1 = str1->length;
6894     len2 = str2->length;
6895
6896     while (len1 > 0 && len2 > 0) {
6897         Py_UNICODE c1, c2;
6898
6899         c1 = *s1++;
6900         c2 = *s2++;
6901
6902         if (c1 > (1<<11) * 26)
6903             c1 += utf16Fixup[c1>>11];
6904         if (c2 > (1<<11) * 26)
6905             c2 += utf16Fixup[c2>>11];
6906         /* now c1 and c2 are in UTF-32-compatible order */
6907
6908         if (c1 != c2)
6909             return (c1 < c2) ? -1 : 1;
6910
6911         len1--; len2--;
6912     }
6913
6914     return (len1 < len2) ? -1 : (len1 != len2);
6915 }
6916
6917 #else
6918
6919 static int
6920 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6921 {
6922     register Py_ssize_t len1, len2;
6923
6924     Py_UNICODE *s1 = str1->str;
6925     Py_UNICODE *s2 = str2->str;
6926
6927     len1 = str1->length;
6928     len2 = str2->length;
6929
6930     while (len1 > 0 && len2 > 0) {
6931         Py_UNICODE c1, c2;
6932
6933         c1 = *s1++;
6934         c2 = *s2++;
6935
6936         if (c1 != c2)
6937             return (c1 < c2) ? -1 : 1;
6938
6939         len1--; len2--;
6940     }
6941
6942     return (len1 < len2) ? -1 : (len1 != len2);
6943 }
6944
6945 #endif
6946
6947 int PyUnicode_Compare(PyObject *left,
6948                       PyObject *right)
6949 {
6950     if (PyUnicode_Check(left) && PyUnicode_Check(right))
6951         return unicode_compare((PyUnicodeObject *)left,
6952                                (PyUnicodeObject *)right);
6953     PyErr_Format(PyExc_TypeError,
6954                  "Can't compare %.100s and %.100s",
6955                  left->ob_type->tp_name,
6956                  right->ob_type->tp_name);
6957     return -1;
6958 }
6959
6960 int
6961 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6962 {
6963     int i;
6964     Py_UNICODE *id;
6965     assert(PyUnicode_Check(uni));
6966     id = PyUnicode_AS_UNICODE(uni);
6967     /* Compare Unicode string and source character set string */
6968     for (i = 0; id[i] && str[i]; i++)
6969         if (id[i] != str[i])
6970             return ((int)id[i] < (int)str[i]) ? -1 : 1;
6971     /* This check keeps Python strings that end in '\0' from comparing equal
6972      to C strings identical up to that point. */
6973     if (PyUnicode_GET_SIZE(uni) != i)
6974         /* We'll say the Python string is longer. */
6975         return 1;
6976     if (id[i])
6977         return 1; /* uni is longer */
6978     if (str[i])
6979         return -1; /* str is longer */
6980     return 0;
6981 }
6982
6983
6984 #define TEST_COND(cond)                         \
6985     ((cond) ? Py_True : Py_False)
6986
6987 PyObject *PyUnicode_RichCompare(PyObject *left,
6988                                 PyObject *right,
6989                                 int op)
6990 {
6991     int result;
6992
6993     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6994         PyObject *v;
6995         if (((PyUnicodeObject *) left)->length !=
6996             ((PyUnicodeObject *) right)->length) {
6997             if (op == Py_EQ) {
6998                 Py_INCREF(Py_False);
6999                 return Py_False;
7000             }
7001             if (op == Py_NE) {
7002                 Py_INCREF(Py_True);
7003                 return Py_True;
7004             }
7005         }
7006         if (left == right)
7007             result = 0;
7008         else
7009             result = unicode_compare((PyUnicodeObject *)left,
7010                                      (PyUnicodeObject *)right);
7011
7012         /* Convert the return value to a Boolean */
7013         switch (op) {
7014         case Py_EQ:
7015             v = TEST_COND(result == 0);
7016             break;
7017         case Py_NE:
7018             v = TEST_COND(result != 0);
7019             break;
7020         case Py_LE:
7021             v = TEST_COND(result <= 0);
7022             break;
7023         case Py_GE:
7024             v = TEST_COND(result >= 0);
7025             break;
7026         case Py_LT:
7027             v = TEST_COND(result == -1);
7028             break;
7029         case Py_GT:
7030             v = TEST_COND(result == 1);
7031             break;
7032         default:
7033             PyErr_BadArgument();
7034             return NULL;
7035         }
7036         Py_INCREF(v);
7037         return v;
7038     }
7039
7040     Py_INCREF(Py_NotImplemented);
7041     return Py_NotImplemented;
7042 }
7043
7044 int PyUnicode_Contains(PyObject *container,
7045                        PyObject *element)
7046 {
7047     PyObject *str, *sub;
7048     int result;
7049
7050     /* Coerce the two arguments */
7051     sub = PyUnicode_FromObject(element);
7052     if (!sub) {
7053         PyErr_Format(PyExc_TypeError,
7054                      "'in <string>' requires string as left operand, not %s",
7055                      element->ob_type->tp_name);
7056         return -1;
7057     }
7058
7059     str = PyUnicode_FromObject(container);
7060     if (!str) {
7061         Py_DECREF(sub);
7062         return -1;
7063     }
7064
7065     result = stringlib_contains_obj(str, sub);
7066
7067     Py_DECREF(str);
7068     Py_DECREF(sub);
7069
7070     return result;
7071 }
7072
7073 /* Concat to string or Unicode object giving a new Unicode object. */
7074
7075 PyObject *PyUnicode_Concat(PyObject *left,
7076                            PyObject *right)
7077 {
7078     PyUnicodeObject *u = NULL, *v = NULL, *w;
7079
7080     /* Coerce the two arguments */
7081     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7082     if (u == NULL)
7083         goto onError;
7084     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7085     if (v == NULL)
7086         goto onError;
7087
7088     /* Shortcuts */
7089     if (v == unicode_empty) {
7090         Py_DECREF(v);
7091         return (PyObject *)u;
7092     }
7093     if (u == unicode_empty) {
7094         Py_DECREF(u);
7095         return (PyObject *)v;
7096     }
7097
7098     /* Concat the two Unicode strings */
7099     w = _PyUnicode_New(u->length + v->length);
7100     if (w == NULL)
7101         goto onError;
7102     Py_UNICODE_COPY(w->str, u->str, u->length);
7103     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7104
7105     Py_DECREF(u);
7106     Py_DECREF(v);
7107     return (PyObject *)w;
7108
7109   onError:
7110     Py_XDECREF(u);
7111     Py_XDECREF(v);
7112     return NULL;
7113 }
7114
7115 void
7116 PyUnicode_Append(PyObject **pleft, PyObject *right)
7117 {
7118     PyObject *new;
7119     if (*pleft == NULL)
7120         return;
7121     if (right == NULL || !PyUnicode_Check(*pleft)) {
7122         Py_DECREF(*pleft);
7123         *pleft = NULL;
7124         return;
7125     }
7126     new = PyUnicode_Concat(*pleft, right);
7127     Py_DECREF(*pleft);
7128     *pleft = new;
7129 }
7130
7131 void
7132 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7133 {
7134     PyUnicode_Append(pleft, right);
7135     Py_XDECREF(right);
7136 }
7137
7138 PyDoc_STRVAR(count__doc__,
7139              "S.count(sub[, start[, end]]) -> int\n\
7140 \n\
7141 Return the number of non-overlapping occurrences of substring sub in\n\
7142 string S[start:end].  Optional arguments start and end are\n\
7143 interpreted as in slice notation.");
7144
7145 static PyObject *
7146 unicode_count(PyUnicodeObject *self, PyObject *args)
7147 {
7148     PyUnicodeObject *substring;
7149     Py_ssize_t start = 0;
7150     Py_ssize_t end = PY_SSIZE_T_MAX;
7151     PyObject *result;
7152
7153     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7154                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7155         return NULL;
7156
7157     substring = (PyUnicodeObject *)PyUnicode_FromObject(
7158         (PyObject *)substring);
7159     if (substring == NULL)
7160         return NULL;
7161
7162     FIX_START_END(self);
7163
7164     result = PyLong_FromSsize_t(
7165         stringlib_count(self->str + start, end - start,
7166                         substring->str, substring->length)
7167         );
7168
7169     Py_DECREF(substring);
7170
7171     return result;
7172 }
7173
7174 PyDoc_STRVAR(encode__doc__,
7175              "S.encode([encoding[, errors]]) -> bytes\n\
7176 \n\
7177 Encode S using the codec registered for encoding. encoding defaults\n\
7178 to the default encoding. errors may be given to set a different error\n\
7179 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7180 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7181 'xmlcharrefreplace' as well as any other name registered with\n\
7182 codecs.register_error that can handle UnicodeEncodeErrors.");
7183
7184 static PyObject *
7185 unicode_encode(PyUnicodeObject *self, PyObject *args)
7186 {
7187     char *encoding = NULL;
7188     char *errors = NULL;
7189     PyObject *v;
7190
7191     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7192         return NULL;
7193     v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7194     if (v == NULL)
7195         goto onError;
7196     if (!PyBytes_Check(v)) {
7197         PyErr_Format(PyExc_TypeError,
7198                      "encoder did not return a bytes object "
7199                      "(type=%.400s)",
7200                      Py_TYPE(v)->tp_name);
7201         Py_DECREF(v);
7202         return NULL;
7203     }
7204     return v;
7205
7206   onError:
7207     return NULL;
7208 }
7209
7210 PyDoc_STRVAR(expandtabs__doc__,
7211              "S.expandtabs([tabsize]) -> str\n\
7212 \n\
7213 Return a copy of S where all tab characters are expanded using spaces.\n\
7214 If tabsize is not given, a tab size of 8 characters is assumed.");
7215
7216 static PyObject*
7217 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7218 {
7219     Py_UNICODE *e;
7220     Py_UNICODE *p;
7221     Py_UNICODE *q;
7222     Py_UNICODE *qe;
7223     Py_ssize_t i, j, incr;
7224     PyUnicodeObject *u;
7225     int tabsize = 8;
7226
7227     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7228         return NULL;
7229
7230     /* First pass: determine size of output string */
7231     i = 0; /* chars up to and including most recent \n or \r */
7232     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7233     e = self->str + self->length; /* end of input */
7234     for (p = self->str; p < e; p++)
7235         if (*p == '\t') {
7236             if (tabsize > 0) {
7237                 incr = tabsize - (j % tabsize); /* cannot overflow */
7238                 if (j > PY_SSIZE_T_MAX - incr)
7239                     goto overflow1;
7240                 j += incr;
7241             }
7242         }
7243         else {
7244             if (j > PY_SSIZE_T_MAX - 1)
7245                 goto overflow1;
7246             j++;
7247             if (*p == '\n' || *p == '\r') {
7248                 if (i > PY_SSIZE_T_MAX - j)
7249                     goto overflow1;
7250                 i += j;
7251                 j = 0;
7252             }
7253         }
7254
7255     if (i > PY_SSIZE_T_MAX - j)
7256         goto overflow1;
7257
7258     /* Second pass: create output string and fill it */
7259     u = _PyUnicode_New(i + j);
7260     if (!u)
7261         return NULL;
7262
7263     j = 0; /* same as in first pass */
7264     q = u->str; /* next output char */
7265     qe = u->str + u->length; /* end of output */
7266
7267     for (p = self->str; p < e; p++)
7268         if (*p == '\t') {
7269             if (tabsize > 0) {
7270                 i = tabsize - (j % tabsize);
7271                 j += i;
7272                 while (i--) {
7273                     if (q >= qe)
7274                         goto overflow2;
7275                     *q++ = ' ';
7276                 }
7277             }
7278         }
7279         else {
7280             if (q >= qe)
7281                 goto overflow2;
7282             *q++ = *p;
7283             j++;
7284             if (*p == '\n' || *p == '\r')
7285                 j = 0;
7286         }
7287
7288     return (PyObject*) u;
7289
7290   overflow2:
7291     Py_DECREF(u);
7292   overflow1:
7293     PyErr_SetString(PyExc_OverflowError, "new string is too long");
7294     return NULL;
7295 }
7296
7297 PyDoc_STRVAR(find__doc__,
7298              "S.find(sub[, start[, end]]) -> int\n\
7299 \n\
7300 Return the lowest index in S where substring sub is found,\n\
7301 such that sub is contained within s[start:end].  Optional\n\
7302 arguments start and end are interpreted as in slice notation.\n\
7303 \n\
7304 Return -1 on failure.");
7305
7306 static PyObject *
7307 unicode_find(PyUnicodeObject *self, PyObject *args)
7308 {
7309     PyObject *substring;
7310     Py_ssize_t start;
7311     Py_ssize_t end;
7312     Py_ssize_t result;
7313
7314     if (!_ParseTupleFinds(args, &substring, &start, &end))
7315         return NULL;
7316
7317     result = stringlib_find_slice(
7318         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7319         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7320         start, end
7321         );
7322
7323     Py_DECREF(substring);
7324
7325     return PyLong_FromSsize_t(result);
7326 }
7327
7328 static PyObject *
7329 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7330 {
7331     if (index < 0 || index >= self->length) {
7332         PyErr_SetString(PyExc_IndexError, "string index out of range");
7333         return NULL;
7334     }
7335
7336     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7337 }
7338
7339 /* Believe it or not, this produces the same value for ASCII strings
7340    as string_hash(). */
7341 static long
7342 unicode_hash(PyUnicodeObject *self)
7343 {
7344     Py_ssize_t len;
7345     Py_UNICODE *p;
7346     long x;
7347
7348     if (self->hash != -1)
7349         return self->hash;
7350     len = Py_SIZE(self);
7351     p = self->str;
7352     x = *p << 7;
7353     while (--len >= 0)
7354         x = (1000003*x) ^ *p++;
7355     x ^= Py_SIZE(self);
7356     if (x == -1)
7357         x = -2;
7358     self->hash = x;
7359     return x;
7360 }
7361
7362 PyDoc_STRVAR(index__doc__,
7363              "S.index(sub[, start[, end]]) -> int\n\
7364 \n\
7365 Like S.find() but raise ValueError when the substring is not found.");
7366
7367 static PyObject *
7368 unicode_index(PyUnicodeObject *self, PyObject *args)
7369 {
7370     Py_ssize_t result;
7371     PyObject *substring;
7372     Py_ssize_t start;
7373     Py_ssize_t end;
7374
7375     if (!_ParseTupleFinds(args, &substring, &start, &end))
7376         return NULL;
7377
7378     result = stringlib_find_slice(
7379         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7380         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7381         start, end
7382         );
7383
7384     Py_DECREF(substring);
7385
7386     if (result < 0) {
7387         PyErr_SetString(PyExc_ValueError, "substring not found");
7388         return NULL;
7389     }
7390
7391     return PyLong_FromSsize_t(result);
7392 }
7393
7394 PyDoc_STRVAR(islower__doc__,
7395              "S.islower() -> bool\n\
7396 \n\
7397 Return True if all cased characters in S are lowercase and there is\n\
7398 at least one cased character in S, False otherwise.");
7399
7400 static PyObject*
7401 unicode_islower(PyUnicodeObject *self)
7402 {
7403     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7404     register const Py_UNICODE *e;
7405     int cased;
7406
7407     /* Shortcut for single character strings */
7408     if (PyUnicode_GET_SIZE(self) == 1)
7409         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7410
7411     /* Special case for empty strings */
7412     if (PyUnicode_GET_SIZE(self) == 0)
7413         return PyBool_FromLong(0);
7414
7415     e = p + PyUnicode_GET_SIZE(self);
7416     cased = 0;
7417     for (; p < e; p++) {
7418         register const Py_UNICODE ch = *p;
7419
7420         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7421             return PyBool_FromLong(0);
7422         else if (!cased && Py_UNICODE_ISLOWER(ch))
7423             cased = 1;
7424     }
7425     return PyBool_FromLong(cased);
7426 }
7427
7428 PyDoc_STRVAR(isupper__doc__,
7429              "S.isupper() -> bool\n\
7430 \n\
7431 Return True if all cased characters in S are uppercase and there is\n\
7432 at least one cased character in S, False otherwise.");
7433
7434 static PyObject*
7435 unicode_isupper(PyUnicodeObject *self)
7436 {
7437     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7438     register const Py_UNICODE *e;
7439     int cased;
7440
7441     /* Shortcut for single character strings */
7442     if (PyUnicode_GET_SIZE(self) == 1)
7443         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7444
7445     /* Special case for empty strings */
7446     if (PyUnicode_GET_SIZE(self) == 0)
7447         return PyBool_FromLong(0);
7448
7449     e = p + PyUnicode_GET_SIZE(self);
7450     cased = 0;
7451     for (; p < e; p++) {
7452         register const Py_UNICODE ch = *p;
7453
7454         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7455             return PyBool_FromLong(0);
7456         else if (!cased && Py_UNICODE_ISUPPER(ch))
7457             cased = 1;
7458     }
7459     return PyBool_FromLong(cased);
7460 }
7461
7462 PyDoc_STRVAR(istitle__doc__,
7463              "S.istitle() -> bool\n\
7464 \n\
7465 Return True if S is a titlecased string and there is at least one\n\
7466 character in S, i.e. upper- and titlecase characters may only\n\
7467 follow uncased characters and lowercase characters only cased ones.\n\
7468 Return False otherwise.");
7469
7470 static PyObject*
7471 unicode_istitle(PyUnicodeObject *self)
7472 {
7473     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7474     register const Py_UNICODE *e;
7475     int cased, previous_is_cased;
7476
7477     /* Shortcut for single character strings */
7478     if (PyUnicode_GET_SIZE(self) == 1)
7479         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7480                                (Py_UNICODE_ISUPPER(*p) != 0));
7481
7482     /* Special case for empty strings */
7483     if (PyUnicode_GET_SIZE(self) == 0)
7484         return PyBool_FromLong(0);
7485
7486     e = p + PyUnicode_GET_SIZE(self);
7487     cased = 0;
7488     previous_is_cased = 0;
7489     for (; p < e; p++) {
7490         register const Py_UNICODE ch = *p;
7491
7492         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7493             if (previous_is_cased)
7494                 return PyBool_FromLong(0);
7495             previous_is_cased = 1;
7496             cased = 1;
7497         }
7498         else if (Py_UNICODE_ISLOWER(ch)) {
7499             if (!previous_is_cased)
7500                 return PyBool_FromLong(0);
7501             previous_is_cased = 1;
7502             cased = 1;
7503         }
7504         else
7505             previous_is_cased = 0;
7506     }
7507     return PyBool_FromLong(cased);
7508 }
7509
7510 PyDoc_STRVAR(isspace__doc__,
7511              "S.isspace() -> bool\n\
7512 \n\
7513 Return True if all characters in S are whitespace\n\
7514 and there is at least one character in S, False otherwise.");
7515
7516 static PyObject*
7517 unicode_isspace(PyUnicodeObject *self)
7518 {
7519     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7520     register const Py_UNICODE *e;
7521
7522     /* Shortcut for single character strings */
7523     if (PyUnicode_GET_SIZE(self) == 1 &&
7524         Py_UNICODE_ISSPACE(*p))
7525         return PyBool_FromLong(1);
7526
7527     /* Special case for empty strings */
7528     if (PyUnicode_GET_SIZE(self) == 0)
7529         return PyBool_FromLong(0);
7530
7531     e = p + PyUnicode_GET_SIZE(self);
7532     for (; p < e; p++) {
7533         if (!Py_UNICODE_ISSPACE(*p))
7534             return PyBool_FromLong(0);
7535     }
7536     return PyBool_FromLong(1);
7537 }
7538
7539 PyDoc_STRVAR(isalpha__doc__,
7540              "S.isalpha() -> bool\n\
7541 \n\
7542 Return True if all characters in S are alphabetic\n\
7543 and there is at least one character in S, False otherwise.");
7544
7545 static PyObject*
7546 unicode_isalpha(PyUnicodeObject *self)
7547 {
7548     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7549     register const Py_UNICODE *e;
7550
7551     /* Shortcut for single character strings */
7552     if (PyUnicode_GET_SIZE(self) == 1 &&
7553         Py_UNICODE_ISALPHA(*p))
7554         return PyBool_FromLong(1);
7555
7556     /* Special case for empty strings */
7557     if (PyUnicode_GET_SIZE(self) == 0)
7558         return PyBool_FromLong(0);
7559
7560     e = p + PyUnicode_GET_SIZE(self);
7561     for (; p < e; p++) {
7562         if (!Py_UNICODE_ISALPHA(*p))
7563             return PyBool_FromLong(0);
7564     }
7565     return PyBool_FromLong(1);
7566 }
7567
7568 PyDoc_STRVAR(isalnum__doc__,
7569              "S.isalnum() -> bool\n\
7570 \n\
7571 Return True if all characters in S are alphanumeric\n\
7572 and there is at least one character in S, False otherwise.");
7573
7574 static PyObject*
7575 unicode_isalnum(PyUnicodeObject *self)
7576 {
7577     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7578     register const Py_UNICODE *e;
7579
7580     /* Shortcut for single character strings */
7581     if (PyUnicode_GET_SIZE(self) == 1 &&
7582         Py_UNICODE_ISALNUM(*p))
7583         return PyBool_FromLong(1);
7584
7585     /* Special case for empty strings */
7586     if (PyUnicode_GET_SIZE(self) == 0)
7587         return PyBool_FromLong(0);
7588
7589     e = p + PyUnicode_GET_SIZE(self);
7590     for (; p < e; p++) {
7591         if (!Py_UNICODE_ISALNUM(*p))
7592             return PyBool_FromLong(0);
7593     }
7594     return PyBool_FromLong(1);
7595 }
7596
7597 PyDoc_STRVAR(isdecimal__doc__,
7598              "S.isdecimal() -> bool\n\
7599 \n\
7600 Return True if there are only decimal characters in S,\n\
7601 False otherwise.");
7602
7603 static PyObject*
7604 unicode_isdecimal(PyUnicodeObject *self)
7605 {
7606     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7607     register const Py_UNICODE *e;
7608
7609     /* Shortcut for single character strings */
7610     if (PyUnicode_GET_SIZE(self) == 1 &&
7611         Py_UNICODE_ISDECIMAL(*p))
7612         return PyBool_FromLong(1);
7613
7614     /* Special case for empty strings */
7615     if (PyUnicode_GET_SIZE(self) == 0)
7616         return PyBool_FromLong(0);
7617
7618     e = p + PyUnicode_GET_SIZE(self);
7619     for (; p < e; p++) {
7620         if (!Py_UNICODE_ISDECIMAL(*p))
7621             return PyBool_FromLong(0);
7622     }
7623     return PyBool_FromLong(1);
7624 }
7625
7626 PyDoc_STRVAR(isdigit__doc__,
7627              "S.isdigit() -> bool\n\
7628 \n\
7629 Return True if all characters in S are digits\n\
7630 and there is at least one character in S, False otherwise.");
7631
7632 static PyObject*
7633 unicode_isdigit(PyUnicodeObject *self)
7634 {
7635     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7636     register const Py_UNICODE *e;
7637
7638     /* Shortcut for single character strings */
7639     if (PyUnicode_GET_SIZE(self) == 1 &&
7640         Py_UNICODE_ISDIGIT(*p))
7641         return PyBool_FromLong(1);
7642
7643     /* Special case for empty strings */
7644     if (PyUnicode_GET_SIZE(self) == 0)
7645         return PyBool_FromLong(0);
7646
7647     e = p + PyUnicode_GET_SIZE(self);
7648     for (; p < e; p++) {
7649         if (!Py_UNICODE_ISDIGIT(*p))
7650             return PyBool_FromLong(0);
7651     }
7652     return PyBool_FromLong(1);
7653 }
7654
7655 PyDoc_STRVAR(isnumeric__doc__,
7656              "S.isnumeric() -> bool\n\
7657 \n\
7658 Return True if there are only numeric characters in S,\n\
7659 False otherwise.");
7660
7661 static PyObject*
7662 unicode_isnumeric(PyUnicodeObject *self)
7663 {
7664     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7665     register const Py_UNICODE *e;
7666
7667     /* Shortcut for single character strings */
7668     if (PyUnicode_GET_SIZE(self) == 1 &&
7669         Py_UNICODE_ISNUMERIC(*p))
7670         return PyBool_FromLong(1);
7671
7672     /* Special case for empty strings */
7673     if (PyUnicode_GET_SIZE(self) == 0)
7674         return PyBool_FromLong(0);
7675
7676     e = p + PyUnicode_GET_SIZE(self);
7677     for (; p < e; p++) {
7678         if (!Py_UNICODE_ISNUMERIC(*p))
7679             return PyBool_FromLong(0);
7680     }
7681     return PyBool_FromLong(1);
7682 }
7683
7684 int
7685 PyUnicode_IsIdentifier(PyObject *self)
7686 {
7687     register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7688     register const Py_UNICODE *e;
7689
7690     /* Special case for empty strings */
7691     if (PyUnicode_GET_SIZE(self) == 0)
7692         return 0;
7693
7694     /* PEP 3131 says that the first character must be in
7695        XID_Start and subsequent characters in XID_Continue,
7696        and for the ASCII range, the 2.x rules apply (i.e
7697        start with letters and underscore, continue with
7698        letters, digits, underscore). However, given the current
7699        definition of XID_Start and XID_Continue, it is sufficient
7700        to check just for these, except that _ must be allowed
7701        as starting an identifier.  */
7702     if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7703         return 0;
7704
7705     e = p + PyUnicode_GET_SIZE(self);
7706     for (p++; p < e; p++) {
7707         if (!_PyUnicode_IsXidContinue(*p))
7708             return 0;
7709     }
7710     return 1;
7711 }
7712
7713 PyDoc_STRVAR(isidentifier__doc__,
7714              "S.isidentifier() -> bool\n\
7715 \n\
7716 Return True if S is a valid identifier according\n\
7717 to the language definition.");
7718
7719 static PyObject*
7720 unicode_isidentifier(PyObject *self)
7721 {
7722     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7723 }
7724
7725 PyDoc_STRVAR(isprintable__doc__,
7726              "S.isprintable() -> bool\n\
7727 \n\
7728 Return True if all characters in S are considered\n\
7729 printable in repr() or S is empty, False otherwise.");
7730
7731 static PyObject*
7732 unicode_isprintable(PyObject *self)
7733 {
7734     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7735     register const Py_UNICODE *e;
7736
7737     /* Shortcut for single character strings */
7738     if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7739         Py_RETURN_TRUE;
7740     }
7741
7742     e = p + PyUnicode_GET_SIZE(self);
7743     for (; p < e; p++) {
7744         if (!Py_UNICODE_ISPRINTABLE(*p)) {
7745             Py_RETURN_FALSE;
7746         }
7747     }
7748     Py_RETURN_TRUE;
7749 }
7750
7751 PyDoc_STRVAR(join__doc__,
7752              "S.join(iterable) -> str\n\
7753 \n\
7754 Return a string which is the concatenation of the strings in the\n\
7755 iterable.  The separator between elements is S.");
7756
7757 static PyObject*
7758 unicode_join(PyObject *self, PyObject *data)
7759 {
7760     return PyUnicode_Join(self, data);
7761 }
7762
7763 static Py_ssize_t
7764 unicode_length(PyUnicodeObject *self)
7765 {
7766     return self->length;
7767 }
7768
7769 PyDoc_STRVAR(ljust__doc__,
7770              "S.ljust(width[, fillchar]) -> str\n\
7771 \n\
7772 Return S left-justified in a Unicode string of length width. Padding is\n\
7773 done using the specified fill character (default is a space).");
7774
7775 static PyObject *
7776 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7777 {
7778     Py_ssize_t width;
7779     Py_UNICODE fillchar = ' ';
7780
7781     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7782         return NULL;
7783
7784     if (self->length >= width && PyUnicode_CheckExact(self)) {
7785         Py_INCREF(self);
7786         return (PyObject*) self;
7787     }
7788
7789     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7790 }
7791
7792 PyDoc_STRVAR(lower__doc__,
7793              "S.lower() -> str\n\
7794 \n\
7795 Return a copy of the string S converted to lowercase.");
7796
7797 static PyObject*
7798 unicode_lower(PyUnicodeObject *self)
7799 {
7800     return fixup(self, fixlower);
7801 }
7802
/* Strip modes understood by _PyUnicode_XStrip(), do_strip() and
   do_argstrip().  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped; used in error messages. */
#define STRIPNAME(i) (stripformat[i]+3)
7811
7812 /* externally visible for str.strip(unicode) */
7813 PyObject *
7814 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7815 {
7816     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7817     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7818     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7819     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7820     Py_ssize_t i, j;
7821
7822     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7823
7824     i = 0;
7825     if (striptype != RIGHTSTRIP) {
7826         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7827             i++;
7828         }
7829     }
7830
7831     j = len;
7832     if (striptype != LEFTSTRIP) {
7833         do {
7834             j--;
7835         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7836         j++;
7837     }
7838
7839     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7840         Py_INCREF(self);
7841         return (PyObject*)self;
7842     }
7843     else
7844         return PyUnicode_FromUnicode(s+i, j-i);
7845 }
7846
7847
7848 static PyObject *
7849 do_strip(PyUnicodeObject *self, int striptype)
7850 {
7851     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7852     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7853
7854     i = 0;
7855     if (striptype != RIGHTSTRIP) {
7856         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7857             i++;
7858         }
7859     }
7860
7861     j = len;
7862     if (striptype != LEFTSTRIP) {
7863         do {
7864             j--;
7865         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7866         j++;
7867     }
7868
7869     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7870         Py_INCREF(self);
7871         return (PyObject*)self;
7872     }
7873     else
7874         return PyUnicode_FromUnicode(s+i, j-i);
7875 }
7876
7877
7878 static PyObject *
7879 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7880 {
7881     PyObject *sep = NULL;
7882
7883     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7884         return NULL;
7885
7886     if (sep != NULL && sep != Py_None) {
7887         if (PyUnicode_Check(sep))
7888             return _PyUnicode_XStrip(self, striptype, sep);
7889         else {
7890             PyErr_Format(PyExc_TypeError,
7891                          "%s arg must be None or str",
7892                          STRIPNAME(striptype));
7893             return NULL;
7894         }
7895     }
7896
7897     return do_strip(self, striptype);
7898 }
7899
7900
7901 PyDoc_STRVAR(strip__doc__,
7902              "S.strip([chars]) -> str\n\
7903 \n\
7904 Return a copy of the string S with leading and trailing\n\
7905 whitespace removed.\n\
7906 If chars is given and not None, remove characters in chars instead.");
7907
7908 static PyObject *
7909 unicode_strip(PyUnicodeObject *self, PyObject *args)
7910 {
7911     if (PyTuple_GET_SIZE(args) == 0)
7912         return do_strip(self, BOTHSTRIP); /* Common case */
7913     else
7914         return do_argstrip(self, BOTHSTRIP, args);
7915 }
7916
7917
7918 PyDoc_STRVAR(lstrip__doc__,
7919              "S.lstrip([chars]) -> str\n\
7920 \n\
7921 Return a copy of the string S with leading whitespace removed.\n\
7922 If chars is given and not None, remove characters in chars instead.");
7923
7924 static PyObject *
7925 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7926 {
7927     if (PyTuple_GET_SIZE(args) == 0)
7928         return do_strip(self, LEFTSTRIP); /* Common case */
7929     else
7930         return do_argstrip(self, LEFTSTRIP, args);
7931 }
7932
7933
7934 PyDoc_STRVAR(rstrip__doc__,
7935              "S.rstrip([chars]) -> str\n\
7936 \n\
7937 Return a copy of the string S with trailing whitespace removed.\n\
7938 If chars is given and not None, remove characters in chars instead.");
7939
7940 static PyObject *
7941 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7942 {
7943     if (PyTuple_GET_SIZE(args) == 0)
7944         return do_strip(self, RIGHTSTRIP); /* Common case */
7945     else
7946         return do_argstrip(self, RIGHTSTRIP, args);
7947 }
7948
7949
7950 static PyObject*
7951 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7952 {
7953     PyUnicodeObject *u;
7954     Py_UNICODE *p;
7955     Py_ssize_t nchars;
7956     size_t nbytes;
7957
7958     if (len < 1) {
7959         Py_INCREF(unicode_empty);
7960         return (PyObject *)unicode_empty;
7961     }
7962
7963     if (len == 1 && PyUnicode_CheckExact(str)) {
7964         /* no repeat, return original string */
7965         Py_INCREF(str);
7966         return (PyObject*) str;
7967     }
7968
7969     /* ensure # of chars needed doesn't overflow int and # of bytes
7970      * needed doesn't overflow size_t
7971      */
7972     nchars = len * str->length;
7973     if (nchars / len != str->length) {
7974         PyErr_SetString(PyExc_OverflowError,
7975                         "repeated string is too long");
7976         return NULL;
7977     }
7978     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7979     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7980         PyErr_SetString(PyExc_OverflowError,
7981                         "repeated string is too long");
7982         return NULL;
7983     }
7984     u = _PyUnicode_New(nchars);
7985     if (!u)
7986         return NULL;
7987
7988     p = u->str;
7989
7990     if (str->length == 1) {
7991         Py_UNICODE_FILL(p, str->str[0], len);
7992     } else {
7993         Py_ssize_t done = str->length; /* number of characters copied this far */
7994         Py_UNICODE_COPY(p, str->str, str->length);
7995         while (done < nchars) {
7996             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7997             Py_UNICODE_COPY(p+done, p, n);
7998             done += n;
7999         }
8000     }
8001
8002     return (PyObject*) u;
8003 }
8004
8005 PyObject *PyUnicode_Replace(PyObject *obj,
8006                             PyObject *subobj,
8007                             PyObject *replobj,
8008                             Py_ssize_t maxcount)
8009 {
8010     PyObject *self;
8011     PyObject *str1;
8012     PyObject *str2;
8013     PyObject *result;
8014
8015     self = PyUnicode_FromObject(obj);
8016     if (self == NULL)
8017         return NULL;
8018     str1 = PyUnicode_FromObject(subobj);
8019     if (str1 == NULL) {
8020         Py_DECREF(self);
8021         return NULL;
8022     }
8023     str2 = PyUnicode_FromObject(replobj);
8024     if (str2 == NULL) {
8025         Py_DECREF(self);
8026         Py_DECREF(str1);
8027         return NULL;
8028     }
8029     result = replace((PyUnicodeObject *)self,
8030                      (PyUnicodeObject *)str1,
8031                      (PyUnicodeObject *)str2,
8032                      maxcount);
8033     Py_DECREF(self);
8034     Py_DECREF(str1);
8035     Py_DECREF(str2);
8036     return result;
8037 }
8038
8039 PyDoc_STRVAR(replace__doc__,
8040              "S.replace(old, new[, count]) -> str\n\
8041 \n\
8042 Return a copy of S with all occurrences of substring\n\
8043 old replaced by new.  If the optional argument count is\n\
8044 given, only the first count occurrences are replaced.");
8045
8046 static PyObject*
8047 unicode_replace(PyUnicodeObject *self, PyObject *args)
8048 {
8049     PyUnicodeObject *str1;
8050     PyUnicodeObject *str2;
8051     Py_ssize_t maxcount = -1;
8052     PyObject *result;
8053
8054     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8055         return NULL;
8056     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8057     if (str1 == NULL)
8058         return NULL;
8059     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8060     if (str2 == NULL) {
8061         Py_DECREF(str1);
8062         return NULL;
8063     }
8064
8065     result = replace(self, str1, str2, maxcount);
8066
8067     Py_DECREF(str1);
8068     Py_DECREF(str2);
8069     return result;
8070 }
8071
8072 static
8073 PyObject *unicode_repr(PyObject *unicode)
8074 {
8075     PyObject *repr;
8076     Py_UNICODE *p;
8077     Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8078     Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8079
8080     /* XXX(nnorwitz): rather than over-allocating, it would be
8081        better to choose a different scheme.  Perhaps scan the
8082        first N-chars of the string and allocate based on that size.
8083     */
8084     /* Initial allocation is based on the longest-possible unichr
8085        escape.
8086
8087        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8088        unichr, so in this case it's the longest unichr escape. In
8089        narrow (UTF-16) builds this is five chars per source unichr
8090        since there are two unichrs in the surrogate pair, so in narrow
8091        (UTF-16) builds it's not the longest unichr escape.
8092
8093        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8094        so in the narrow (UTF-16) build case it's the longest unichr
8095        escape.
8096     */
8097
8098     repr = PyUnicode_FromUnicode(NULL,
8099                                  2 /* quotes */
8100 #ifdef Py_UNICODE_WIDE
8101                                  + 10*size
8102 #else
8103                                  + 6*size
8104 #endif
8105                                  + 1);
8106     if (repr == NULL)
8107         return NULL;
8108
8109     p = PyUnicode_AS_UNICODE(repr);
8110
8111     /* Add quote */
8112     *p++ = (findchar(s, size, '\'') &&
8113             !findchar(s, size, '"')) ? '"' : '\'';
8114     while (size-- > 0) {
8115         Py_UNICODE ch = *s++;
8116
8117         /* Escape quotes and backslashes */
8118         if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8119             *p++ = '\\';
8120             *p++ = ch;
8121             continue;
8122         }
8123
8124         /* Map special whitespace to '\t', \n', '\r' */
8125         if (ch == '\t') {
8126             *p++ = '\\';
8127             *p++ = 't';
8128         }
8129         else if (ch == '\n') {
8130             *p++ = '\\';
8131             *p++ = 'n';
8132         }
8133         else if (ch == '\r') {
8134             *p++ = '\\';
8135             *p++ = 'r';
8136         }
8137
8138         /* Map non-printable US ASCII to '\xhh' */
8139         else if (ch < ' ' || ch == 0x7F) {
8140             *p++ = '\\';
8141             *p++ = 'x';
8142             *p++ = hexdigits[(ch >> 4) & 0x000F];
8143             *p++ = hexdigits[ch & 0x000F];
8144         }
8145
8146         /* Copy ASCII characters as-is */
8147         else if (ch < 0x7F) {
8148             *p++ = ch;
8149         }
8150
8151         /* Non-ASCII characters */
8152         else {
8153             Py_UCS4 ucs = ch;
8154
8155 #ifndef Py_UNICODE_WIDE
8156             Py_UNICODE ch2 = 0;
8157             /* Get code point from surrogate pair */
8158             if (size > 0) {
8159                 ch2 = *s;
8160                 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8161                     && ch2 <= 0xDFFF) {
8162                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8163                         + 0x00010000;
8164                     s++;
8165                     size--;
8166                 }
8167             }
8168 #endif
8169             /* Map Unicode whitespace and control characters
8170                (categories Z* and C* except ASCII space)
8171             */
8172             if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8173                 /* Map 8-bit characters to '\xhh' */
8174                 if (ucs <= 0xff) {
8175                     *p++ = '\\';
8176                     *p++ = 'x';
8177                     *p++ = hexdigits[(ch >> 4) & 0x000F];
8178                     *p++ = hexdigits[ch & 0x000F];
8179                 }
8180                 /* Map 21-bit characters to '\U00xxxxxx' */
8181                 else if (ucs >= 0x10000) {
8182                     *p++ = '\\';
8183                     *p++ = 'U';
8184                     *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8185                     *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8186                     *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8187                     *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8188                     *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8189                     *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8190                     *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8191                     *p++ = hexdigits[ucs & 0x0000000F];
8192                 }
8193                 /* Map 16-bit characters to '\uxxxx' */
8194                 else {
8195                     *p++ = '\\';
8196                     *p++ = 'u';
8197                     *p++ = hexdigits[(ucs >> 12) & 0x000F];
8198                     *p++ = hexdigits[(ucs >> 8) & 0x000F];
8199                     *p++ = hexdigits[(ucs >> 4) & 0x000F];
8200                     *p++ = hexdigits[ucs & 0x000F];
8201                 }
8202             }
8203             /* Copy characters as-is */
8204             else {
8205                 *p++ = ch;
8206 #ifndef Py_UNICODE_WIDE
8207                 if (ucs >= 0x10000)
8208                     *p++ = ch2;
8209 #endif
8210             }
8211         }
8212     }
8213     /* Add quote */
8214     *p++ = PyUnicode_AS_UNICODE(repr)[0];
8215
8216     *p = '\0';
8217     PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8218     return repr;
8219 }
8220
8221 PyDoc_STRVAR(rfind__doc__,
8222              "S.rfind(sub[, start[, end]]) -> int\n\
8223 \n\
8224 Return the highest index in S where substring sub is found,\n\
8225 such that sub is contained within s[start:end].  Optional\n\
8226 arguments start and end are interpreted as in slice notation.\n\
8227 \n\
8228 Return -1 on failure.");
8229
8230 static PyObject *
8231 unicode_rfind(PyUnicodeObject *self, PyObject *args)
8232 {
8233     PyObject *substring;
8234     Py_ssize_t start;
8235     Py_ssize_t end;
8236     Py_ssize_t result;
8237
8238     if (!_ParseTupleFinds(args, &substring, &start, &end))
8239         return NULL;
8240
8241     result = stringlib_rfind_slice(
8242         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8243         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8244         start, end
8245         );
8246
8247     Py_DECREF(substring);
8248
8249     return PyLong_FromSsize_t(result);
8250 }
8251
8252 PyDoc_STRVAR(rindex__doc__,
8253              "S.rindex(sub[, start[, end]]) -> int\n\
8254 \n\
8255 Like S.rfind() but raise ValueError when the substring is not found.");
8256
8257 static PyObject *
8258 unicode_rindex(PyUnicodeObject *self, PyObject *args)
8259 {
8260     PyObject *substring;
8261     Py_ssize_t start;
8262     Py_ssize_t end;
8263     Py_ssize_t result;
8264
8265     if (!_ParseTupleFinds(args, &substring, &start, &end))
8266         return NULL;
8267
8268     result = stringlib_rfind_slice(
8269         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8270         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8271         start, end
8272         );
8273
8274     Py_DECREF(substring);
8275
8276     if (result < 0) {
8277         PyErr_SetString(PyExc_ValueError, "substring not found");
8278         return NULL;
8279     }
8280     return PyLong_FromSsize_t(result);
8281 }
8282
8283 PyDoc_STRVAR(rjust__doc__,
8284              "S.rjust(width[, fillchar]) -> str\n\
8285 \n\
8286 Return S right-justified in a string of length width. Padding is\n\
8287 done using the specified fill character (default is a space).");
8288
8289 static PyObject *
8290 unicode_rjust(PyUnicodeObject *self, PyObject *args)
8291 {
8292     Py_ssize_t width;
8293     Py_UNICODE fillchar = ' ';
8294
8295     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8296         return NULL;
8297
8298     if (self->length >= width && PyUnicode_CheckExact(self)) {
8299         Py_INCREF(self);
8300         return (PyObject*) self;
8301     }
8302
8303     return (PyObject*) pad(self, width - self->length, 0, fillchar);
8304 }
8305
8306 PyObject *PyUnicode_Split(PyObject *s,
8307                           PyObject *sep,
8308                           Py_ssize_t maxsplit)
8309 {
8310     PyObject *result;
8311
8312     s = PyUnicode_FromObject(s);
8313     if (s == NULL)
8314         return NULL;
8315     if (sep != NULL) {
8316         sep = PyUnicode_FromObject(sep);
8317         if (sep == NULL) {
8318             Py_DECREF(s);
8319             return NULL;
8320         }
8321     }
8322
8323     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8324
8325     Py_DECREF(s);
8326     Py_XDECREF(sep);
8327     return result;
8328 }
8329
8330 PyDoc_STRVAR(split__doc__,
8331              "S.split([sep[, maxsplit]]) -> list of strings\n\
8332 \n\
8333 Return a list of the words in S, using sep as the\n\
8334 delimiter string.  If maxsplit is given, at most maxsplit\n\
8335 splits are done. If sep is not specified or is None, any\n\
8336 whitespace string is a separator and empty strings are\n\
8337 removed from the result.");
8338
8339 static PyObject*
8340 unicode_split(PyUnicodeObject *self, PyObject *args)
8341 {
8342     PyObject *substring = Py_None;
8343     Py_ssize_t maxcount = -1;
8344
8345     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8346         return NULL;
8347
8348     if (substring == Py_None)
8349         return split(self, NULL, maxcount);
8350     else if (PyUnicode_Check(substring))
8351         return split(self, (PyUnicodeObject *)substring, maxcount);
8352     else
8353         return PyUnicode_Split((PyObject *)self, substring, maxcount);
8354 }
8355
8356 PyObject *
8357 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8358 {
8359     PyObject* str_obj;
8360     PyObject* sep_obj;
8361     PyObject* out;
8362
8363     str_obj = PyUnicode_FromObject(str_in);
8364     if (!str_obj)
8365         return NULL;
8366     sep_obj = PyUnicode_FromObject(sep_in);
8367     if (!sep_obj) {
8368         Py_DECREF(str_obj);
8369         return NULL;
8370     }
8371
8372     out = stringlib_partition(
8373         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8374         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8375         );
8376
8377     Py_DECREF(sep_obj);
8378     Py_DECREF(str_obj);
8379
8380     return out;
8381 }
8382
8383
8384 PyObject *
8385 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8386 {
8387     PyObject* str_obj;
8388     PyObject* sep_obj;
8389     PyObject* out;
8390
8391     str_obj = PyUnicode_FromObject(str_in);
8392     if (!str_obj)
8393         return NULL;
8394     sep_obj = PyUnicode_FromObject(sep_in);
8395     if (!sep_obj) {
8396         Py_DECREF(str_obj);
8397         return NULL;
8398     }
8399
8400     out = stringlib_rpartition(
8401         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8402         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8403         );
8404
8405     Py_DECREF(sep_obj);
8406     Py_DECREF(str_obj);
8407
8408     return out;
8409 }
8410
8411 PyDoc_STRVAR(partition__doc__,
8412              "S.partition(sep) -> (head, sep, tail)\n\
8413 \n\
8414 Search for the separator sep in S, and return the part before it,\n\
8415 the separator itself, and the part after it.  If the separator is not\n\
8416 found, return S and two empty strings.");
8417
8418 static PyObject*
8419 unicode_partition(PyUnicodeObject *self, PyObject *separator)
8420 {
8421     return PyUnicode_Partition((PyObject *)self, separator);
8422 }
8423
8424 PyDoc_STRVAR(rpartition__doc__,
8425              "S.rpartition(sep) -> (head, sep, tail)\n\
8426 \n\
8427 Search for the separator sep in S, starting at the end of S, and return\n\
8428 the part before it, the separator itself, and the part after it.  If the\n\
8429 separator is not found, return two empty strings and S.");
8430
8431 static PyObject*
8432 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8433 {
8434     return PyUnicode_RPartition((PyObject *)self, separator);
8435 }
8436
8437 PyObject *PyUnicode_RSplit(PyObject *s,
8438                            PyObject *sep,
8439                            Py_ssize_t maxsplit)
8440 {
8441     PyObject *result;
8442
8443     s = PyUnicode_FromObject(s);
8444     if (s == NULL)
8445         return NULL;
8446     if (sep != NULL) {
8447         sep = PyUnicode_FromObject(sep);
8448         if (sep == NULL) {
8449             Py_DECREF(s);
8450             return NULL;
8451         }
8452     }
8453
8454     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8455
8456     Py_DECREF(s);
8457     Py_XDECREF(sep);
8458     return result;
8459 }
8460
8461 PyDoc_STRVAR(rsplit__doc__,
8462              "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8463 \n\
8464 Return a list of the words in S, using sep as the\n\
8465 delimiter string, starting at the end of the string and\n\
8466 working to the front.  If maxsplit is given, at most maxsplit\n\
8467 splits are done. If sep is not specified, any whitespace string\n\
8468 is a separator.");
8469
8470 static PyObject*
8471 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8472 {
8473     PyObject *substring = Py_None;
8474     Py_ssize_t maxcount = -1;
8475
8476     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8477         return NULL;
8478
8479     if (substring == Py_None)
8480         return rsplit(self, NULL, maxcount);
8481     else if (PyUnicode_Check(substring))
8482         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8483     else
8484         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8485 }
8486
8487 PyDoc_STRVAR(splitlines__doc__,
8488              "S.splitlines([keepends]) -> list of strings\n\
8489 \n\
8490 Return a list of the lines in S, breaking at line boundaries.\n\
8491 Line breaks are not included in the resulting list unless keepends\n\
8492 is given and true.");
8493
8494 static PyObject*
8495 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8496 {
8497     int keepends = 0;
8498
8499     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8500         return NULL;
8501
8502     return PyUnicode_Splitlines((PyObject *)self, keepends);
8503 }
8504
8505 static
8506 PyObject *unicode_str(PyObject *self)
8507 {
8508     if (PyUnicode_CheckExact(self)) {
8509         Py_INCREF(self);
8510         return self;
8511     } else
8512         /* Subtype -- return genuine unicode string with the same value. */
8513         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8514                                      PyUnicode_GET_SIZE(self));
8515 }
8516
8517 PyDoc_STRVAR(swapcase__doc__,
8518              "S.swapcase() -> str\n\
8519 \n\
8520 Return a copy of S with uppercase characters converted to lowercase\n\
8521 and vice versa.");
8522
8523 static PyObject*
8524 unicode_swapcase(PyUnicodeObject *self)
8525 {
8526     return fixup(self, fixswapcase);
8527 }
8528
8529 PyDoc_STRVAR(maketrans__doc__,
8530              "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8531 \n\
8532 Return a translation table usable for str.translate().\n\
8533 If there is only one argument, it must be a dictionary mapping Unicode\n\
8534 ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8535 Character keys will be then converted to ordinals.\n\
8536 If there are two arguments, they must be strings of equal length, and\n\
8537 in the resulting dictionary, each character in x will be mapped to the\n\
8538 character at the same position in y. If there is a third argument, it\n\
8539 must be a string, whose characters will be mapped to None in the result.");
8540
8541 static PyObject*
8542 unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8543 {
8544     PyObject *x, *y = NULL, *z = NULL;
8545     PyObject *new = NULL, *key, *value;
8546     Py_ssize_t i = 0;
8547     int res;
8548
8549     if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8550         return NULL;
8551     new = PyDict_New();
8552     if (!new)
8553         return NULL;
8554     if (y != NULL) {
8555         /* x must be a string too, of equal length */
8556         Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8557         if (!PyUnicode_Check(x)) {
8558             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8559                             "be a string if there is a second argument");
8560             goto err;
8561         }
8562         if (PyUnicode_GET_SIZE(x) != ylen) {
8563             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8564                             "arguments must have equal length");
8565             goto err;
8566         }
8567         /* create entries for translating chars in x to those in y */
8568         for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8569             key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8570             value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8571             if (!key || !value)
8572                 goto err;
8573             res = PyDict_SetItem(new, key, value);
8574             Py_DECREF(key);
8575             Py_DECREF(value);
8576             if (res < 0)
8577                 goto err;
8578         }
8579         /* create entries for deleting chars in z */
8580         if (z != NULL) {
8581             for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8582                 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8583                 if (!key)
8584                     goto err;
8585                 res = PyDict_SetItem(new, key, Py_None);
8586                 Py_DECREF(key);
8587                 if (res < 0)
8588                     goto err;
8589             }
8590         }
8591     } else {
8592         /* x must be a dict */
8593         if (!PyDict_CheckExact(x)) {
8594             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8595                             "to maketrans it must be a dict");
8596             goto err;
8597         }
8598         /* copy entries into the new dict, converting string keys to int keys */
8599         while (PyDict_Next(x, &i, &key, &value)) {
8600             if (PyUnicode_Check(key)) {
8601                 /* convert string keys to integer keys */
8602                 PyObject *newkey;
8603                 if (PyUnicode_GET_SIZE(key) != 1) {
8604                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
8605                                     "table must be of length 1");
8606                     goto err;
8607                 }
8608                 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8609                 if (!newkey)
8610                     goto err;
8611                 res = PyDict_SetItem(new, newkey, value);
8612                 Py_DECREF(newkey);
8613                 if (res < 0)
8614                     goto err;
8615             } else if (PyLong_Check(key)) {
8616                 /* just keep integer keys */
8617                 if (PyDict_SetItem(new, key, value) < 0)
8618                     goto err;
8619             } else {
8620                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8621                                 "be strings or integers");
8622                 goto err;
8623             }
8624         }
8625     }
8626     return new;
8627   err:
8628     Py_DECREF(new);
8629     return NULL;
8630 }
8631
8632 PyDoc_STRVAR(translate__doc__,
8633              "S.translate(table) -> str\n\
8634 \n\
8635 Return a copy of the string S, where all characters have been mapped\n\
8636 through the given translation table, which must be a mapping of\n\
8637 Unicode ordinals to Unicode ordinals, strings, or None.\n\
8638 Unmapped characters are left untouched. Characters mapped to None\n\
8639 are deleted.");
8640
8641 static PyObject*
8642 unicode_translate(PyUnicodeObject *self, PyObject *table)
8643 {
8644     return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8645 }
8646
8647 PyDoc_STRVAR(upper__doc__,
8648              "S.upper() -> str\n\
8649 \n\
8650 Return a copy of S converted to uppercase.");
8651
8652 static PyObject*
8653 unicode_upper(PyUnicodeObject *self)
8654 {
8655     return fixup(self, fixupper);
8656 }
8657
8658 PyDoc_STRVAR(zfill__doc__,
8659              "S.zfill(width) -> str\n\
8660 \n\
8661 Pad a numeric string S with zeros on the left, to fill a field\n\
8662 of the specified width. The string S is never truncated.");
8663
8664 static PyObject *
8665 unicode_zfill(PyUnicodeObject *self, PyObject *args)
8666 {
8667     Py_ssize_t fill;
8668     PyUnicodeObject *u;
8669
8670     Py_ssize_t width;
8671     if (!PyArg_ParseTuple(args, "n:zfill", &width))
8672         return NULL;
8673
8674     if (self->length >= width) {
8675         if (PyUnicode_CheckExact(self)) {
8676             Py_INCREF(self);
8677             return (PyObject*) self;
8678         }
8679         else
8680             return PyUnicode_FromUnicode(
8681                 PyUnicode_AS_UNICODE(self),
8682                 PyUnicode_GET_SIZE(self)
8683                 );
8684     }
8685
8686     fill = width - self->length;
8687
8688     u = pad(self, fill, 0, '0');
8689
8690     if (u == NULL)
8691         return NULL;
8692
8693     if (u->str[fill] == '+' || u->str[fill] == '-') {
8694         /* move sign to beginning of string */
8695         u->str[0] = u->str[fill];
8696         u->str[fill] = '0';
8697     }
8698
8699     return (PyObject*) u;
8700 }
8701
#if 0
/* Compiled out: returns the current value of numfree as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8709
8710 PyDoc_STRVAR(startswith__doc__,
8711              "S.startswith(prefix[, start[, end]]) -> bool\n\
8712 \n\
8713 Return True if S starts with the specified prefix, False otherwise.\n\
8714 With optional start, test S beginning at that position.\n\
8715 With optional end, stop comparing S at that position.\n\
8716 prefix can also be a tuple of strings to try.");
8717
8718 static PyObject *
8719 unicode_startswith(PyUnicodeObject *self,
8720                    PyObject *args)
8721 {
8722     PyObject *subobj;
8723     PyUnicodeObject *substring;
8724     Py_ssize_t start = 0;
8725     Py_ssize_t end = PY_SSIZE_T_MAX;
8726     int result;
8727
8728     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8729                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8730         return NULL;
8731     if (PyTuple_Check(subobj)) {
8732         Py_ssize_t i;
8733         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8734             substring = (PyUnicodeObject *)PyUnicode_FromObject(
8735                 PyTuple_GET_ITEM(subobj, i));
8736             if (substring == NULL)
8737                 return NULL;
8738             result = tailmatch(self, substring, start, end, -1);
8739             Py_DECREF(substring);
8740             if (result) {
8741                 Py_RETURN_TRUE;
8742             }
8743         }
8744         /* nothing matched */
8745         Py_RETURN_FALSE;
8746     }
8747     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8748     if (substring == NULL)
8749         return NULL;
8750     result = tailmatch(self, substring, start, end, -1);
8751     Py_DECREF(substring);
8752     return PyBool_FromLong(result);
8753 }
8754
8755
8756 PyDoc_STRVAR(endswith__doc__,
8757              "S.endswith(suffix[, start[, end]]) -> bool\n\
8758 \n\
8759 Return True if S ends with the specified suffix, False otherwise.\n\
8760 With optional start, test S beginning at that position.\n\
8761 With optional end, stop comparing S at that position.\n\
8762 suffix can also be a tuple of strings to try.");
8763
8764 static PyObject *
8765 unicode_endswith(PyUnicodeObject *self,
8766                  PyObject *args)
8767 {
8768     PyObject *subobj;
8769     PyUnicodeObject *substring;
8770     Py_ssize_t start = 0;
8771     Py_ssize_t end = PY_SSIZE_T_MAX;
8772     int result;
8773
8774     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8775                           _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8776         return NULL;
8777     if (PyTuple_Check(subobj)) {
8778         Py_ssize_t i;
8779         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8780             substring = (PyUnicodeObject *)PyUnicode_FromObject(
8781                 PyTuple_GET_ITEM(subobj, i));
8782             if (substring == NULL)
8783                 return NULL;
8784             result = tailmatch(self, substring, start, end, +1);
8785             Py_DECREF(substring);
8786             if (result) {
8787                 Py_RETURN_TRUE;
8788             }
8789         }
8790         Py_RETURN_FALSE;
8791     }
8792     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8793     if (substring == NULL)
8794         return NULL;
8795
8796     result = tailmatch(self, substring, start, end, +1);
8797     Py_DECREF(substring);
8798     return PyBool_FromLong(result);
8799 }
8800
8801 #include "stringlib/string_format.h"
8802
8803 PyDoc_STRVAR(format__doc__,
8804              "S.format(*args, **kwargs) -> str\n\
8805 \n\
8806 ");
8807
8808 static PyObject *
8809 unicode__format__(PyObject* self, PyObject* args)
8810 {
8811     PyObject *format_spec;
8812
8813     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8814         return NULL;
8815
8816     return _PyUnicode_FormatAdvanced(self,
8817                                      PyUnicode_AS_UNICODE(format_spec),
8818                                      PyUnicode_GET_SIZE(format_spec));
8819 }
8820
8821 PyDoc_STRVAR(p_format__doc__,
8822              "S.__format__(format_spec) -> str\n\
8823 \n\
8824 ");
8825
8826 static PyObject *
8827 unicode__sizeof__(PyUnicodeObject *v)
8828 {
8829     return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8830                               sizeof(Py_UNICODE) * (v->length + 1));
8831 }
8832
8833 PyDoc_STRVAR(sizeof__doc__,
8834              "S.__sizeof__() -> size of S in memory, in bytes");
8835
8836 static PyObject *
8837 unicode_getnewargs(PyUnicodeObject *v)
8838 {
8839     return Py_BuildValue("(u#)", v->str, v->length);
8840 }
8841
8842
8843 static PyMethodDef unicode_methods[] = {
8844
8845     /* Order is according to common usage: often used methods should
8846        appear first, since lookup is done sequentially. */
8847
8848     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8849     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8850     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8851     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8852     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8853     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8854     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8855     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8856     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8857     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8858     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8859     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8860     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8861     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8862     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8863     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8864     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8865     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8866     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8867     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8868     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8869     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8870     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8871     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8872     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8873     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8874     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8875     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8876     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8877     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8878     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8879     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8880     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8881     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8882     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8883     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8884     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8885     {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8886     {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8887     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8888     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8889     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8890     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8891     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8892     {"maketrans", (PyCFunction) unicode_maketrans,
8893      METH_VARARGS | METH_STATIC, maketrans__doc__},
8894     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8895 #if 0
8896     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8897 #endif
8898
8899 #if 0
8900     /* This one is just used for debugging the implementation. */
8901     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8902 #endif
8903
8904     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
8905     {NULL, NULL}
8906 };
8907
8908 static PyObject *
8909 unicode_mod(PyObject *v, PyObject *w)
8910 {
8911     if (!PyUnicode_Check(v)) {
8912         Py_INCREF(Py_NotImplemented);
8913         return Py_NotImplemented;
8914     }
8915     return PyUnicode_Format(v, w);
8916 }
8917
8918 static PyNumberMethods unicode_as_number = {
8919     0,              /*nb_add*/
8920     0,              /*nb_subtract*/
8921     0,              /*nb_multiply*/
8922     unicode_mod,            /*nb_remainder*/
8923 };
8924
8925 static PySequenceMethods unicode_as_sequence = {
8926     (lenfunc) unicode_length,       /* sq_length */
8927     PyUnicode_Concat,           /* sq_concat */
8928     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
8929     (ssizeargfunc) unicode_getitem,     /* sq_item */
8930     0,                  /* sq_slice */
8931     0,                  /* sq_ass_item */
8932     0,                  /* sq_ass_slice */
8933     PyUnicode_Contains,         /* sq_contains */
8934 };
8935
8936 static PyObject*
8937 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8938 {
8939     if (PyIndex_Check(item)) {
8940         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8941         if (i == -1 && PyErr_Occurred())
8942             return NULL;
8943         if (i < 0)
8944             i += PyUnicode_GET_SIZE(self);
8945         return unicode_getitem(self, i);
8946     } else if (PySlice_Check(item)) {
8947         Py_ssize_t start, stop, step, slicelength, cur, i;
8948         Py_UNICODE* source_buf;
8949         Py_UNICODE* result_buf;
8950         PyObject* result;
8951
8952         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8953                                  &start, &stop, &step, &slicelength) < 0) {
8954             return NULL;
8955         }
8956
8957         if (slicelength <= 0) {
8958             return PyUnicode_FromUnicode(NULL, 0);
8959         } else if (start == 0 && step == 1 && slicelength == self->length &&
8960                    PyUnicode_CheckExact(self)) {
8961             Py_INCREF(self);
8962             return (PyObject *)self;
8963         } else if (step == 1) {
8964             return PyUnicode_FromUnicode(self->str + start, slicelength);
8965         } else {
8966             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8967             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8968                                                        sizeof(Py_UNICODE));
8969
8970             if (result_buf == NULL)
8971                 return PyErr_NoMemory();
8972
8973             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8974                 result_buf[i] = source_buf[cur];
8975             }
8976
8977             result = PyUnicode_FromUnicode(result_buf, slicelength);
8978             PyObject_FREE(result_buf);
8979             return result;
8980         }
8981     } else {
8982         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8983         return NULL;
8984     }
8985 }
8986
8987 static PyMappingMethods unicode_as_mapping = {
8988     (lenfunc)unicode_length,        /* mp_length */
8989     (binaryfunc)unicode_subscript,  /* mp_subscript */
8990     (objobjargproc)0,           /* mp_ass_subscript */
8991 };
8992
8993
8994 /* Helpers for PyUnicode_Format() */
8995
8996 static PyObject *
8997 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8998 {
8999     Py_ssize_t argidx = *p_argidx;
9000     if (argidx < arglen) {
9001         (*p_argidx)++;
9002         if (arglen < 0)
9003             return args;
9004         else
9005             return PyTuple_GetItem(args, argidx);
9006     }
9007     PyErr_SetString(PyExc_TypeError,
9008                     "not enough arguments for format string");
9009     return NULL;
9010 }
9011
9012 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
9013
9014 static PyObject *
9015 formatfloat(PyObject *v, int flags, int prec, int type)
9016 {
9017     char *p;
9018     PyObject *result;
9019     double x;
9020
9021     x = PyFloat_AsDouble(v);
9022     if (x == -1.0 && PyErr_Occurred())
9023         return NULL;
9024
9025     if (prec < 0)
9026         prec = 6;
9027
9028     p = PyOS_double_to_string(x, type, prec,
9029                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9030     if (p == NULL)
9031         return NULL;
9032     result = PyUnicode_FromStringAndSize(p, strlen(p));
9033     PyMem_Free(p);
9034     return result;
9035 }
9036
9037 static PyObject*
9038 formatlong(PyObject *val, int flags, int prec, int type)
9039 {
9040     char *buf;
9041     int len;
9042     PyObject *str; /* temporary string object. */
9043     PyObject *result;
9044
9045     str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9046     if (!str)
9047         return NULL;
9048     result = PyUnicode_FromStringAndSize(buf, len);
9049     Py_DECREF(str);
9050     return result;
9051 }
9052
9053 static int
9054 formatchar(Py_UNICODE *buf,
9055            size_t buflen,
9056            PyObject *v)
9057 {
9058     /* presume that the buffer is at least 3 characters long */
9059     if (PyUnicode_Check(v)) {
9060         if (PyUnicode_GET_SIZE(v) == 1) {
9061             buf[0] = PyUnicode_AS_UNICODE(v)[0];
9062             buf[1] = '\0';
9063             return 1;
9064         }
9065 #ifndef Py_UNICODE_WIDE
9066         if (PyUnicode_GET_SIZE(v) == 2) {
9067             /* Decode a valid surrogate pair */
9068             int c0 = PyUnicode_AS_UNICODE(v)[0];
9069             int c1 = PyUnicode_AS_UNICODE(v)[1];
9070             if (0xD800 <= c0 && c0 <= 0xDBFF &&
9071                 0xDC00 <= c1 && c1 <= 0xDFFF) {
9072                 buf[0] = c0;
9073                 buf[1] = c1;
9074                 buf[2] = '\0';
9075                 return 2;
9076             }
9077         }
9078 #endif
9079         goto onError;
9080     }
9081     else {
9082         /* Integer input truncated to a character */
9083         long x;
9084         x = PyLong_AsLong(v);
9085         if (x == -1 && PyErr_Occurred())
9086             goto onError;
9087
9088         if (x < 0 || x > 0x10ffff) {
9089             PyErr_SetString(PyExc_OverflowError,
9090                             "%c arg not in range(0x110000)");
9091             return -1;
9092         }
9093
9094 #ifndef Py_UNICODE_WIDE
9095         if (x > 0xffff) {
9096             x -= 0x10000;
9097             buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9098             buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9099             return 2;
9100         }
9101 #endif
9102         buf[0] = (Py_UNICODE) x;
9103         buf[1] = '\0';
9104         return 1;
9105     }
9106
9107   onError:
9108     PyErr_SetString(PyExc_TypeError,
9109                     "%c requires int or char");
9110     return -1;
9111 }
9112
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The replacement list is fully parenthesized so that the macro acts as
   a single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
9117
/* Implement %-formatting for unicode strings.

   format is converted with PyUnicode_FromObject() and scanned once;
   ordinary characters are copied to the result and every conversion
   specifier is replaced by the formatted value of the next argument.

   args is either a tuple (arguments are consumed in order), a single
   non-tuple value, or -- for "%(key)..." specifiers -- an object whose
   type has tp_as_mapping and that is neither a tuple nor a unicode
   string.

   Returns a new reference to the resulting unicode object, or NULL with
   an exception set. */
PyObject *PyUnicode_Format(PyObject *format,
                           PyObject *args)
{
    /* fmt/fmtcnt: read cursor into the format and number of characters
       still to be read.
       res/rescnt/reslen: write cursor into the result, number of free
       slots left and total allocated length of the result. */
    Py_UNICODE *fmt, *res;
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    /* Set once args has been replaced by the result of a mapping lookup,
       i.e. when this function holds a reference that it must release. */
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
        return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);

    /* Initial size guess: the format length plus 100 spare slots; the
       buffer is grown on demand and trimmed at the end. */
    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
        goto onError;
    res = PyUnicode_AS_UNICODE(result);

    if (PyTuple_Check(args)) {
        arglen = PyTuple_Size(args);
        argidx = 0;
    }
    else {
        /* Single non-tuple argument: a negative arglen makes getnextarg()
           hand back args itself.  Starting argidx at -2 lets exactly one
           such request succeed (-2 < -1) before the index reaches
           arglen. */
        arglen = -1;
        argidx = -2;
    }
    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
        !PyUnicode_Check(args))
        dict = args;

    while (--fmtcnt >= 0) {
        if (*fmt != '%') {
            /* Ordinary character: copy it, growing the result first if
               there is no free slot left. */
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
        }
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v = NULL;
            /* Owned reference to the formatted value, if any; pbuf/len
               describe the characters to be emitted. */
            PyObject *temp = NULL;
            Py_UNICODE *pbuf;
            Py_UNICODE sign;
            Py_ssize_t len;
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */

            fmt++;
            if (*fmt == '(') {
                /* "%(key)...": look the value up in the mapping and use
                   it as the single argument of this specifier. */
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                /* fmt now points one past the closing ')', which is not
                   part of the key. */
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
#if 0
                /* keys are converted to strings using UTF-8 and
                   then looked up since Python uses strings to hold
                   variables names etc. in its namespaces and we
                   wouldn't want to break common idioms. */
                key = PyUnicode_EncodeUTF8(keystart,
                                           keylen,
                                           NULL);
#else
                key = PyUnicode_FromUnicode(keystart, keylen);
#endif
                if (key == NULL)
                    goto onError;
                /* Release the value fetched for a previous "%(key)". */
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                args_owned = 1;
                arglen = -1;
                argidx = -2;
            }
            /* Conversion flags; the loop leaves the first non-flag
               character in c. */
            while (--fmtcnt >= 0) {
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
                break;
            }
            /* Field width: '*' takes it from the next argument (a
               negative value means left-justify), otherwise it is read
               as a decimal number. */
            if (c == '*') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyLong_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                width = PyLong_AsLong(v);
                if (width == -1 && PyErr_Occurred())
                    goto onError;
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
                    /* Reject a width whose next digit would overflow. */
                    if ((width*10) / 10 != width) {
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            /* Precision: same scheme as the width, introduced by '.';
               a negative '*' precision is treated as 0. */
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyLong_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    prec = PyLong_AsLong(v);
                    if (prec == -1 && PyErr_Occurred())
                        goto onError;
                    if (prec < 0)
                        prec = 0;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
                        c = *fmt++;
                        if (c < '0' || c > '9')
                            break;
                        if ((prec*10) / 10 != prec) {
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            /* A length modifier (h, l or L) is accepted and skipped. */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            if (fmtcnt < 0) {
                PyErr_SetString(PyExc_ValueError,
                                "incomplete format");
                goto onError;
            }
            /* Every conversion except "%%" consumes one argument. */
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
            }
            sign = 0;
            fill = ' ';
            switch (c) {

            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;

            case 's':
            case 'r':
            case 'a':
                /* An exact str formatted with %s is used as-is; anything
                   else goes through str(), repr() or ascii(). */
                if (PyUnicode_CheckExact(v) && c == 's') {
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    if (c == 's')
                        temp = PyObject_Str(v);
                    else if (c == 'r')
                        temp = PyObject_Repr(v);
                    else
                        temp = PyObject_ASCII(v);
                    if (temp == NULL)
                        goto onError;
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
                        goto onError;
                    }
                }
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                /* The precision truncates the text. */
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                /* %i is handled as %d. */
                if (c == 'i')
                    c = 'd';
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (PyLong_Check(v)) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        iobj = PyNumber_Long(v);
                    }
                    if (iobj!=NULL) {
                        if (PyLong_Check(iobj)) {
                            isnumok = 1;
                            temp = formatlong(iobj, flags, prec, c);
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            /* Non-zero here only requests the sign
                               handling done after the switch. */
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
                }
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
                temp = formatfloat(v, flags, prec, c);
                if (!temp)
                    goto onError;
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;

            default:
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
                             (int)c,
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            /* For numeric conversions, split a leading sign off the
               digits; afterwards sign holds the character to emit
               ('-', '+' or ' ') or 0 for none. */
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            /* Make sure the result has room for the whole field (plus
               the sign), growing it if necessary. */
            if (rescnt - (sign != 0) < width) {
                reslen -= rescnt;
                rescnt = width + fmtcnt + 100;
                reslen += rescnt;
                if (reslen < 0) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            /* With a non-space fill the sign and the "0x"-style prefix
               go in front of the padding; with space fill they are
               written after it (see the fill == ' ' branch below). */
            if (sign) {
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            /* Left padding, unless the field is left-justified. */
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            /* The formatted characters themselves ... */
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            /* ... followed by right padding for left-justified fields. */
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
                Py_XDECREF(temp);
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
    } /* until end */
    /* argidx < arglen at this point means some arguments were never
       consumed; this is only an error when no mapping is in use. */
    if (argidx < arglen && !dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Trim the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

  onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}
9557
9558 static PyObject *
9559 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9560
9561 static PyObject *
9562 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9563 {
9564     PyObject *x = NULL;
9565     static char *kwlist[] = {"object", "encoding", "errors", 0};
9566     char *encoding = NULL;
9567     char *errors = NULL;
9568
9569     if (type != &PyUnicode_Type)
9570         return unicode_subtype_new(type, args, kwds);
9571     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9572                                      kwlist, &x, &encoding, &errors))
9573         return NULL;
9574     if (x == NULL)
9575         return (PyObject *)_PyUnicode_New(0);
9576     if (encoding == NULL && errors == NULL)
9577         return PyObject_Str(x);
9578     else
9579         return PyUnicode_FromEncodedObject(x, encoding, errors);
9580 }
9581
9582 static PyObject *
9583 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9584 {
9585     PyUnicodeObject *tmp, *pnew;
9586     Py_ssize_t n;
9587
9588     assert(PyType_IsSubtype(type, &PyUnicode_Type));
9589     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9590     if (tmp == NULL)
9591         return NULL;
9592     assert(PyUnicode_Check(tmp));
9593     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9594     if (pnew == NULL) {
9595         Py_DECREF(tmp);
9596         return NULL;
9597     }
9598     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9599     if (pnew->str == NULL) {
9600         _Py_ForgetReference((PyObject *)pnew);
9601         PyObject_Del(pnew);
9602         Py_DECREF(tmp);
9603         return PyErr_NoMemory();
9604     }
9605     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9606     pnew->length = n;
9607     pnew->hash = tmp->hash;
9608     Py_DECREF(tmp);
9609     return (PyObject *)pnew;
9610 }
9611
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9618
9619 static PyObject *unicode_iter(PyObject *seq);
9620
/* Type object for str.  Instances are created through unicode_new();
   the type can be subclassed (Py_TPFLAGS_BASETYPE) and derives from
   PyBaseObject_Type. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9664
9665 /* Initialize the Unicode implementation */
9666
9667 void _PyUnicode_Init(void)
9668 {
9669     int i;
9670
9671     /* XXX - move this array to unicodectype.c ? */
9672     Py_UNICODE linebreak[] = {
9673         0x000A, /* LINE FEED */
9674         0x000D, /* CARRIAGE RETURN */
9675         0x001C, /* FILE SEPARATOR */
9676         0x001D, /* GROUP SEPARATOR */
9677         0x001E, /* RECORD SEPARATOR */
9678         0x0085, /* NEXT LINE */
9679         0x2028, /* LINE SEPARATOR */
9680         0x2029, /* PARAGRAPH SEPARATOR */
9681     };
9682
9683     /* Init the implementation */
9684     free_list = NULL;
9685     numfree = 0;
9686     unicode_empty = _PyUnicode_New(0);
9687     if (!unicode_empty)
9688         return;
9689
9690     for (i = 0; i < 256; i++)
9691         unicode_latin1[i] = NULL;
9692     if (PyType_Ready(&PyUnicode_Type) < 0)
9693         Py_FatalError("Can't initialize 'unicode'");
9694
9695     /* initialize the linebreak bloom filter */
9696     bloom_linebreak = make_bloom_mask(
9697         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9698         );
9699
9700     PyType_Ready(&EncodingMapType);
9701 }
9702
9703 /* Finalize the Unicode implementation */
9704
9705 int
9706 PyUnicode_ClearFreeList(void)
9707 {
9708     int freelist_size = numfree;
9709     PyUnicodeObject *u;
9710
9711     for (u = free_list; u != NULL;) {
9712         PyUnicodeObject *v = u;
9713         u = *(PyUnicodeObject **)u;
9714         if (v->str)
9715             PyObject_DEL(v->str);
9716         Py_XDECREF(v->defenc);
9717         PyObject_Del(v);
9718         numfree--;
9719     }
9720     free_list = NULL;
9721     assert(numfree == 0);
9722     return freelist_size;
9723 }
9724
9725 void
9726 _PyUnicode_Fini(void)
9727 {
9728     int i;
9729
9730     Py_XDECREF(unicode_empty);
9731     unicode_empty = NULL;
9732
9733     for (i = 0; i < 256; i++) {
9734         if (unicode_latin1[i]) {
9735             Py_DECREF(unicode_latin1[i]);
9736             unicode_latin1[i] = NULL;
9737         }
9738     }
9739     (void)PyUnicode_ClearFreeList();
9740 }
9741
/* Intern the unicode string *p in place.

   If an equal string is already stored in the `interned` dict, the
   reference in *p is released and *p is replaced by a new reference to
   the stored string.  Otherwise *p itself is inserted into the dict, its
   state becomes SSTATE_INTERNED_MORTAL and the two references held by
   the dict are subtracted from its reference count.

   Instances of str subclasses and strings that are already interned are
   left untouched.  Failure to create or update the dict is swallowed:
   the exception is cleared and *p is left un-interned.  Passing NULL or
   a non-unicode object is a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
        t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

        /* An equal string is already interned: hand that one out
           instead of the caller's object. */
        if (t) {
            Py_INCREF(t);
            Py_DECREF(*p);
            *p = t;
            return;
        }

    /* recursion_critical is raised around the insertion and reset on
       both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9789
9790 void
9791 PyUnicode_InternImmortal(PyObject **p)
9792 {
9793     PyUnicode_InternInPlace(p);
9794     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9795         PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9796         Py_INCREF(*p);
9797     }
9798 }
9799
9800 PyObject *
9801 PyUnicode_InternFromString(const char *cp)
9802 {
9803     PyObject *s = PyUnicode_FromString(cp);
9804     if (s == NULL)
9805         return NULL;
9806     PyUnicode_InternInPlace(&s);
9807     return s;
9808 }
9809
9810 void _Py_ReleaseInternedUnicodeStrings(void)
9811 {
9812     PyObject *keys;
9813     PyUnicodeObject *s;
9814     Py_ssize_t i, n;
9815     Py_ssize_t immortal_size = 0, mortal_size = 0;
9816
9817     if (interned == NULL || !PyDict_Check(interned))
9818         return;
9819     keys = PyDict_Keys(interned);
9820     if (keys == NULL || !PyList_Check(keys)) {
9821         PyErr_Clear();
9822         return;
9823     }
9824
9825     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9826        detector, interned unicode strings are not forcibly deallocated;
9827        rather, we give them their stolen references back, and then clear
9828        and DECREF the interned dict. */
9829
9830     n = PyList_GET_SIZE(keys);
9831     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9832             n);
9833     for (i = 0; i < n; i++) {
9834         s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9835         switch (s->state) {
9836         case SSTATE_NOT_INTERNED:
9837             /* XXX Shouldn't happen */
9838             break;
9839         case SSTATE_INTERNED_IMMORTAL:
9840             Py_REFCNT(s) += 1;
9841             immortal_size += s->length;
9842             break;
9843         case SSTATE_INTERNED_MORTAL:
9844             Py_REFCNT(s) += 2;
9845             mortal_size += s->length;
9846             break;
9847         default:
9848             Py_FatalError("Inconsistent interned string state.");
9849         }
9850         s->state = SSTATE_NOT_INTERNED;
9851     }
9852     fprintf(stderr, "total size of all interned strings: "
9853             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9854             "mortal/immortal\n", mortal_size, immortal_size);
9855     Py_DECREF(keys);
9856     PyDict_Clear(interned);
9857     Py_DECREF(interned);
9858     interned = NULL;
9859 }
9860
9861
9862 /********************* Unicode Iterator **************************/
9863
/* State of a str iterator: the string being iterated over and the index
   of the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next item to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9869
9870 static void
9871 unicodeiter_dealloc(unicodeiterobject *it)
9872 {
9873     _PyObject_GC_UNTRACK(it);
9874     Py_XDECREF(it->it_seq);
9875     PyObject_GC_Del(it);
9876 }
9877
/* GC traversal for str iterators: the only object reference held is the
   string being iterated over. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT relies on the parameters being named `visit` and `arg`. */
    Py_VISIT(it->it_seq);
    return 0;
}
9884
9885 static PyObject *
9886 unicodeiter_next(unicodeiterobject *it)
9887 {
9888     PyUnicodeObject *seq;
9889     PyObject *item;
9890
9891     assert(it != NULL);
9892     seq = it->it_seq;
9893     if (seq == NULL)
9894         return NULL;
9895     assert(PyUnicode_Check(seq));
9896
9897     if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9898         item = PyUnicode_FromUnicode(
9899             PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9900         if (item != NULL)
9901             ++it->it_index;
9902         return item;
9903     }
9904
9905     Py_DECREF(seq);
9906     it->it_seq = NULL;
9907     return NULL;
9908 }
9909
9910 static PyObject *
9911 unicodeiter_len(unicodeiterobject *it)
9912 {
9913     Py_ssize_t len = 0;
9914     if (it->it_seq)
9915         len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9916     return PyLong_FromSsize_t(len);
9917 }
9918
/* Docstring for the __length_hint__ method of str iterators. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; its only entry exposes
   unicodeiter_len() as __length_hint__, taking no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9926
/* Type object for str iterators ("str_iterator").  Instances take part
   in garbage collection (Py_TPFLAGS_HAVE_GC) and are their own iterator
   (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
9959
9960 static PyObject *
9961 unicode_iter(PyObject *seq)
9962 {
9963     unicodeiterobject *it;
9964
9965     if (!PyUnicode_Check(seq)) {
9966         PyErr_BadInternalCall();
9967         return NULL;
9968     }
9969     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9970     if (it == NULL)
9971         return NULL;
9972     it->it_index = 0;
9973     Py_INCREF(seq);
9974     it->it_seq = (PyUnicodeObject *)seq;
9975     _PyObject_GC_TRACK(it);
9976     return (PyObject *)it;
9977 }
9978
9979 size_t
9980 Py_UNICODE_strlen(const Py_UNICODE *u)
9981 {
9982     int res = 0;
9983     while(*u++)
9984         res++;
9985     return res;
9986 }
9987
9988 Py_UNICODE*
9989 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9990 {
9991     Py_UNICODE *u = s1;
9992     while ((*u++ = *s2++));
9993     return s1;
9994 }
9995
9996 Py_UNICODE*
9997 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9998 {
9999     Py_UNICODE *u = s1;
10000     while ((*u++ = *s2++))
10001         if (n-- == 0)
10002             break;
10003     return s1;
10004 }
10005
10006 int
10007 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10008 {
10009     while (*s1 && *s2 && *s1 == *s2)
10010         s1++, s2++;
10011     if (*s1 && *s2)
10012         return (*s1 < *s2) ? -1 : +1;
10013     if (*s1)
10014         return 1;
10015     if (*s2)
10016         return -1;
10017     return 0;
10018 }
10019
10020 Py_UNICODE*
10021 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10022 {
10023     const Py_UNICODE *p;
10024     for (p = s; *p; p++)
10025         if (*p == c)
10026             return (Py_UNICODE*)p;
10027     return NULL;
10028 }
10029
10030
10031 #ifdef __cplusplus
10032 }
10033 #endif
10034
10035
10036 /*
10037   Local variables:
10038   c-basic-offset: 4
10039   indent-tabs-mode: nil
10040   End:
10041 */