Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
/* Free list for Unicode objects.  Deallocated objects are chained
   through their first pointer-sized word (see unicode_dealloc() and
   _PyUnicode_New()).  numfree counts the objects currently on the list;
   unicode_dealloc() only adds an object while numfree is below
   PyUnicode_MAXFREELIST. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.
   While this is NULL the constructors in this file fall back to
   allocating a fresh object for zero-length requests. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal;
   entries start out NULL and are filled in on first use by
   PyUnicode_FromUnicode() and PyUnicode_FromStringAndSize(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117         0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * HORIZONTAL TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * VERTICAL TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123         0, 1, 1, 1, 1, 1, 0, 0,
 124         0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129         0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131         1, 0, 0, 0, 0, 0, 0, 0,
 132         0, 0, 0, 0, 0, 0, 0, 0,
 133         0, 0, 0, 0, 0, 0, 0, 0,
 134         0, 0, 0, 0, 0, 0, 0, 0,
 135
 136         0, 0, 0, 0, 0, 0, 0, 0,
 137         0, 0, 0, 0, 0, 0, 0, 0,
 138         0, 0, 0, 0, 0, 0, 0, 0,
 139         0, 0, 0, 0, 0, 0, 0, 0,
 140         0, 0, 0, 0, 0, 0, 0, 0,
 141         0, 0, 0, 0, 0, 0, 0, 0,
 142         0, 0, 0, 0, 0, 0, 0, 0,
 143         0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148         0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000D, * CARRIAGE RETURN */
 151         0, 0, 1, 0, 0, 1, 0, 0,
 152         0, 0, 0, 0, 0, 0, 0, 0,
 153 /*         0x001C, * FILE SEPARATOR */
 154 /*         0x001D, * GROUP SEPARATOR */
 155 /*         0x001E, * RECORD SEPARATOR */
 156         0, 0, 0, 0, 1, 1, 1, 0,
 157         0, 0, 0, 0, 0, 0, 0, 0,
 158         0, 0, 0, 0, 0, 0, 0, 0,
 159         0, 0, 0, 0, 0, 0, 0, 0,
 160         0, 0, 0, 0, 0, 0, 0, 0,
 161
 162         0, 0, 0, 0, 0, 0, 0, 0,
 163         0, 0, 0, 0, 0, 0, 0, 0,
 164         0, 0, 0, 0, 0, 0, 0, 0,
 165         0, 0, 0, 0, 0, 0, 0, 0,
 166         0, 0, 0, 0, 0, 0, 0, 0,
 167         0, 0, 0, 0, 0, 0, 0, 0,
 168         0, 0, 0, 0, 0, 0, 0, 0,
 169         0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177         return 0x10FFFF;
 178 #else
 179         /* This is actually an illegal character, so it should
 180            not be passed to unichr. */
 181         return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, we use a single bitmask, with the 5 least
   significant bits of each Unicode character as the bit index. */
 190
 191 /* the linebreak mask is set up by Unicode_Init below */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch) \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] : \
 201     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                       Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272  reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?
 285
 286    XXX This allocator could further be enhanced by assuring that the
 287        free list never reduces its size below 1.
 288
 289 */
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354  onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 static
 390 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 391 {
 392     register PyUnicodeObject *v;
 393
 394     /* Argument checks */
 395     if (unicode == NULL) {
 396         PyErr_BadInternalCall();
 397         return -1;
 398     }
 399     v = *unicode;
 400     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 401         PyErr_BadInternalCall();
 402         return -1;
 403     }
 404
 405     /* Resizing unicode_empty and single character objects is not
 406        possible since these are being shared. We simply return a fresh
 407        copy with the same Unicode content. */
 408     if (v->length != length &&
 409         (v == unicode_empty || v->length == 1)) {
 410         PyUnicodeObject *w = _PyUnicode_New(length);
 411         if (w == NULL)
 412             return -1;
 413         Py_UNICODE_COPY(w->str, v->str,
 414                         length < v->length ? length : v->length);
 415         Py_DECREF(*unicode);
 416         *unicode = w;
 417         return 0;
 418     }
 419
 420     /* Note that we don't have to modify *unicode for unshared Unicode
 421        objects, since we can modify them in-place. */
 422     return unicode_resize(v, length);
 423 }
 424
 425 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 426 {
 427     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 428 }
 429
 430 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 431                                 Py_ssize_t size)
 432 {
 433     PyUnicodeObject *unicode;
 434
 435     /* If the Unicode data is known at construction time, we can apply
 436        some optimizations which share commonly used objects. */
 437     if (u != NULL) {
 438
 439         /* Optimization for empty strings */
 440         if (size == 0 && unicode_empty != NULL) {
 441             Py_INCREF(unicode_empty);
 442             return (PyObject *)unicode_empty;
 443         }
 444
 445         /* Single character Unicode objects in the Latin-1 range are
 446            shared when using this constructor */
 447         if (size == 1 && *u < 256) {
 448             unicode = unicode_latin1[*u];
 449             if (!unicode) {
 450                 unicode = _PyUnicode_New(1);
 451                 if (!unicode)
 452                     return NULL;
 453                 unicode->str[0] = *u;
 454                 unicode_latin1[*u] = unicode;
 455             }
 456             Py_INCREF(unicode);
 457             return (PyObject *)unicode;
 458         }
 459     }
 460
 461     unicode = _PyUnicode_New(size);
 462     if (!unicode)
 463         return NULL;
 464
 465     /* Copy the Unicode data into the new object */
 466     if (u != NULL)
 467         Py_UNICODE_COPY(unicode->str, u, size);
 468
 469     return (PyObject *)unicode;
 470 }
 471
 472 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 473 {
 474     PyUnicodeObject *unicode;
 475
 476         if (size < 0) {
 477                 PyErr_SetString(PyExc_SystemError,
 478                     "Negative size passed to PyUnicode_FromStringAndSize");
 479                 return NULL;
 480         }
 481
 482     /* If the Unicode data is known at construction time, we can apply
 483        some optimizations which share commonly used objects.
 484        Also, this means the input must be UTF-8, so fall back to the
 485        UTF-8 decoder at the end. */
 486     if (u != NULL) {
 487
 488         /* Optimization for empty strings */
 489         if (size == 0 && unicode_empty != NULL) {
 490             Py_INCREF(unicode_empty);
 491             return (PyObject *)unicode_empty;
 492         }
 493
 494         /* Single characters are shared when using this constructor.
 495            Restrict to ASCII, since the input must be UTF-8. */
 496         if (size == 1 && Py_CHARMASK(*u) < 128) {
 497             unicode = unicode_latin1[Py_CHARMASK(*u)];
 498             if (!unicode) {
 499                 unicode = _PyUnicode_New(1);
 500                 if (!unicode)
 501                     return NULL;
 502                 unicode->str[0] = Py_CHARMASK(*u);
 503                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 504             }
 505             Py_INCREF(unicode);
 506             return (PyObject *)unicode;
 507         }
 508
 509         return PyUnicode_DecodeUTF8(u, size, NULL);
 510     }
 511
 512     unicode = _PyUnicode_New(size);
 513     if (!unicode)
 514         return NULL;
 515
 516     return (PyObject *)unicode;
 517 }
 518
 519 PyObject *PyUnicode_FromString(const char *u)
 520 {
 521     size_t size = strlen(u);
 522     if (size > PY_SSIZE_T_MAX) {
 523         PyErr_SetString(PyExc_OverflowError, "input too long");
 524         return NULL;
 525     }
 526
 527     return PyUnicode_FromStringAndSize(u, size);
 528 }
 529
 530 #ifdef HAVE_WCHAR_H
 531
 532 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 533                                  Py_ssize_t size)
 534 {
 535     PyUnicodeObject *unicode;
 536
 537     if (w == NULL) {
 538         PyErr_BadInternalCall();
 539         return NULL;
 540     }
 541
 542     unicode = _PyUnicode_New(size);
 543     if (!unicode)
 544         return NULL;
 545
 546     /* Copy the wchar_t data into the new object */
 547 #ifdef HAVE_USABLE_WCHAR_T
 548     memcpy(unicode->str, w, size * sizeof(wchar_t));
 549 #else
 550     {
 551         register Py_UNICODE *u;
 552         register Py_ssize_t i;
 553         u = PyUnicode_AS_UNICODE(unicode);
 554         for (i = size; i > 0; i--)
 555             *u++ = *w++;
 556     }
 557 #endif
 558
 559     return (PyObject *)unicode;
 560 }
 561
 562 static void
 563 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 564 {
 565         *fmt++ = '%';
 566         if (width) {
 567                 if (zeropad)
 568                         *fmt++ = '0';
 569                 fmt += sprintf(fmt, "%d", width);
 570         }
 571         if (precision)
 572                 fmt += sprintf(fmt, ".%d", precision);
 573         if (longflag)
 574                 *fmt++ = 'l';
 575         else if (size_tflag) {
 576                 char *f = PY_FORMAT_SIZE_T;
 577                 while (*f)
 578                         *fmt++ = *f++;
 579         }
 580         *fmt++ = c;
 581         *fmt = '\0';
 582 }
 583
/* Append the NUL-terminated char string `string` to the output being
   built.  The expansion uses two variables of the enclosing scope:
   `copy` as the read cursor and `s` as the write cursor, which is left
   pointing just past the last element written.  The terminating NUL is
   not copied. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 585
 586 PyObject *
 587 PyUnicode_FromFormatV(const char *format, va_list vargs)
 588 {
 589         va_list count;
 590         Py_ssize_t callcount = 0;
 591         PyObject **callresults = NULL;
 592         PyObject **callresult = NULL;
 593         Py_ssize_t n = 0;
 594         int width = 0;
 595         int precision = 0;
 596         int zeropad;
 597         const char* f;
 598         Py_UNICODE *s;
 599         PyObject *string;
 600         /* used by sprintf */
 601         char buffer[21];
 602         /* use abuffer instead of buffer, if we need more space
 603          * (which can happen if there's a format specifier with width). */
 604         char *abuffer = NULL;
 605         char *realbuffer;
 606         Py_ssize_t abuffersize = 0;
 607         char fmt[60]; /* should be enough for %0width.precisionld */
 608         const char *copy;
 609
 610 #ifdef VA_LIST_IS_ARRAY
 611         Py_MEMCPY(count, vargs, sizeof(va_list));
 612 #else
 613 #ifdef  __va_copy
 614         __va_copy(count, vargs);
 615 #else
 616         count = vargs;
 617 #endif
 618 #endif
 619         /* step 1: count the number of %S/%R format specifications
 620          * (we call PyObject_Str()/PyObject_Repr() for these objects
 621          * once during step 3 and put the result in an array) */
 622         for (f = format; *f; f++) {
 623                 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
 624                         ++callcount;
 625         }
 626         /* step 2: allocate memory for the results of
 627          * PyObject_Str()/PyObject_Repr() calls */
 628         if (callcount) {
 629                 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 630                 if (!callresults) {
 631                         PyErr_NoMemory();
 632                         return NULL;
 633                 }
 634                 callresult = callresults;
 635         }
 636         /* step 3: figure out how large a buffer we need */
 637         for (f = format; *f; f++) {
 638                 if (*f == '%') {
 639                         const char* p = f;
 640                         width = 0;
 641                         while (isdigit((unsigned)*f))
 642                                 width = (width*10) + *f++ - '0';
 643                         while (*++f && *f != '%' && !isalpha((unsigned)*f))
 644                                 ;
 645
 646                         /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 647                          * they don't affect the amount of space we reserve.
 648                          */
 649                         if ((*f == 'l' || *f == 'z') &&
 650                                         (f[1] == 'd' || f[1] == 'u'))
 651                                 ++f;
 652
 653                         switch (*f) {
 654                         case 'c':
 655                                 (void)va_arg(count, int);
 656                                 /* fall through... */
 657                         case '%':
 658                                 n++;
 659                                 break;
 660                         case 'd': case 'u': case 'i': case 'x':
 661                                 (void) va_arg(count, int);
 662                                 /* 20 bytes is enough to hold a 64-bit
 663                                    integer.  Decimal takes the most space.
 664                                    This isn't enough for octal.
 665                                    If a width is specified we need more
 666                                    (which we allocate later). */
 667                                 if (width < 20)
 668                                         width = 20;
 669                                 n += width;
 670                                 if (abuffersize < width)
 671                                         abuffersize = width;
 672                                 break;
 673                         case 's':
 674                         {
 675                                 /* UTF-8 */
 676                                 unsigned char*s;
 677                                 s = va_arg(count, unsigned char*);
 678                                 while (*s) {
 679                                         if (*s < 128) {
 680                                                 n++; s++;
 681                                         } else if (*s < 0xc0) {
 682                                                 /* invalid UTF-8 */
 683                                                 n++; s++;
 684                                         } else if (*s < 0xc0) {
 685                                                 n++;
 686                                                 s++; if(!*s)break;
 687                                                 s++;
 688                                         } else if (*s < 0xe0) {
 689                                                 n++;
 690                                                 s++; if(!*s)break;
 691                                                 s++; if(!*s)break;
 692                                                 s++;
 693                                         } else {
 694                                                 #ifdef Py_UNICODE_WIDE
 695                                                 n++;
 696                                                 #else
 697                                                 n+=2;
 698                                                 #endif
 699                                                 s++; if(!*s)break;
 700                                                 s++; if(!*s)break;
 701                                                 s++; if(!*s)break;
 702                                                 s++;
 703                                         }
 704                                 }
 705                                 break;
 706                         }
 707                         case 'U':
 708                         {
 709                                 PyObject *obj = va_arg(count, PyObject *);
 710                                 assert(obj && PyUnicode_Check(obj));
 711                                 n += PyUnicode_GET_SIZE(obj);
 712                                 break;
 713                         }
 714                         case 'V':
 715                         {
 716                                 PyObject *obj = va_arg(count, PyObject *);
 717                                 const char *str = va_arg(count, const char *);
 718                                 assert(obj || str);
 719                                 assert(!obj || PyUnicode_Check(obj));
 720                                 if (obj)
 721                                         n += PyUnicode_GET_SIZE(obj);
 722                                 else
 723                                         n += strlen(str);
 724                                 break;
 725                         }
 726                         case 'S':
 727                         {
 728                                 PyObject *obj = va_arg(count, PyObject *);
 729                                 PyObject *str;
 730                                 assert(obj);
 731                                 str = PyObject_Str(obj);
 732                                 if (!str)
 733                                         goto fail;
 734                                 n += PyUnicode_GET_SIZE(str);
 735                                 /* Remember the str and switch to the next slot */
 736                                 *callresult++ = str;
 737                                 break;
 738                         }
 739                         case 'R':
 740                         {
 741                                 PyObject *obj = va_arg(count, PyObject *);
 742                                 PyObject *repr;
 743                                 assert(obj);
 744                                 repr = PyObject_Repr(obj);
 745                                 if (!repr)
 746                                         goto fail;
 747                                 n += PyUnicode_GET_SIZE(repr);
 748                                 /* Remember the repr and switch to the next slot */
 749                                 *callresult++ = repr;
 750                                 break;
 751                         }
 752                         case 'p':
 753                                 (void) va_arg(count, int);
 754                                 /* maximum 64-bit pointer representation:
 755                                  * 0xffffffffffffffff
 756                                  * so 19 characters is enough.
 757                                  * XXX I count 18 -- what's the extra for?
 758                                  */
 759                                 n += 19;
 760                                 break;
 761                         default:
 762                                 /* if we stumble upon an unknown
 763                                    formatting code, copy the rest of
 764                                    the format string to the output
 765                                    string. (we cannot just skip the
 766                                    code, since there's no way to know
 767                                    what's in the argument list) */
 768                                 n += strlen(p);
 769                                 goto expand;
 770                         }
 771                 } else
 772                         n++;
 773         }
 774  expand:
 775         if (abuffersize > 20) {
 776                 abuffer = PyObject_Malloc(abuffersize);
 777                 if (!abuffer) {
 778                         PyErr_NoMemory();
 779                         goto fail;
 780                 }
 781                 realbuffer = abuffer;
 782         }
 783         else
 784                 realbuffer = buffer;
 785         /* step 4: fill the buffer */
 786         /* Since we've analyzed how much space we need for the worst case,
 787            we don't have to resize the string.
 788            There can be no errors beyond this point. */
 789         string = PyUnicode_FromUnicode(NULL, n);
 790         if (!string)
 791                 goto fail;
 792
 793         s = PyUnicode_AS_UNICODE(string);
 794         callresult = callresults;
 795
 796         for (f = format; *f; f++) {
 797                 if (*f == '%') {
 798                         const char* p = f++;
 799                         int longflag = 0;
 800                         int size_tflag = 0;
 801                         zeropad = (*f == '0');
 802                         /* parse the width.precision part */
 803                         width = 0;
 804                         while (isdigit((unsigned)*f))
 805                                 width = (width*10) + *f++ - '0';
 806                         precision = 0;
 807                         if (*f == '.') {
 808                                 f++;
 809                                 while (isdigit((unsigned)*f))
 810                                         precision = (precision*10) + *f++ - '0';
 811                         }
 812                         /* handle the long flag, but only for %ld and %lu.
 813                            others can be added when necessary. */
 814                         if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 815                                 longflag = 1;
 816                                 ++f;
 817                         }
 818                         /* handle the size_t flag. */
 819                         if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 820                                 size_tflag = 1;
 821                                 ++f;
 822                         }
 823
 824                         switch (*f) {
 825                         case 'c':
 826                                 *s++ = va_arg(vargs, int);
 827                                 break;
 828                         case 'd':
 829                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 830                                 if (longflag)
 831                                         sprintf(realbuffer, fmt, va_arg(vargs, long));
 832                                 else if (size_tflag)
 833                                         sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 834                                 else
 835                                         sprintf(realbuffer, fmt, va_arg(vargs, int));
 836                                 appendstring(realbuffer);
 837                                 break;
 838                         case 'u':
 839                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 840                                 if (longflag)
 841                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 842                                 else if (size_tflag)
 843                                         sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 844                                 else
 845                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 846                                 appendstring(realbuffer);
 847                                 break;
 848                         case 'i':
 849                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 850                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 851                                 appendstring(realbuffer);
 852                                 break;
 853                         case 'x':
 854                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 855                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 856                                 appendstring(realbuffer);
 857                                 break;
 858                         case 's':
 859                         {
 860                                 /* Parameter must be UTF-8 encoded.
 861                                    In case of encoding errors, use
 862                                    the replacement character. */
 863                                 PyObject *u;
 864                                 p = va_arg(vargs, char*);
 865                                 u = PyUnicode_DecodeUTF8(p, strlen(p),
 866                                                          "replace");
 867                                 if (!u)
 868                                         goto fail;
 869                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
 870                                                 PyUnicode_GET_SIZE(u));
 871                                 s += PyUnicode_GET_SIZE(u);
 872                                 Py_DECREF(u);
 873                                 break;
 874                         }
 875                         case 'U':
 876                         {
 877                                 PyObject *obj = va_arg(vargs, PyObject *);
 878                                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 879                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 880                                 s += size;
 881                                 break;
 882                         }
 883                         case 'V':
 884                         {
 885                                 PyObject *obj = va_arg(vargs, PyObject *);
 886                                 const char *str = va_arg(vargs, const char *);
 887                                 if (obj) {
 888                                         Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 889                                         Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 890                                         s += size;
 891                                 } else {
 892                                         appendstring(str);
 893                                 }
 894                                 break;
 895                         }
 896                         case 'S':
 897                         case 'R':
 898                         {
 899                                 Py_UNICODE *ucopy;
 900                                 Py_ssize_t usize;
 901                                 Py_ssize_t upos;
 902                                 /* unused, since we already have the result */
 903                                 (void) va_arg(vargs, PyObject *);
 904                                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 905                                 usize = PyUnicode_GET_SIZE(*callresult);
 906                                 for (upos = 0; upos<usize;)
 907                                         *s++ = ucopy[upos++];
 908                                 /* We're done with the unicode()/repr() => forget it */
 909                                 Py_DECREF(*callresult);
 910                                 /* switch to next unicode()/repr() result */
 911                                 ++callresult;
 912                                 break;
 913                         }
 914                         case 'p':
 915                                 sprintf(buffer, "%p", va_arg(vargs, void*));
 916                                 /* %p is ill-defined:  ensure leading 0x. */
 917                                 if (buffer[1] == 'X')
 918                                         buffer[1] = 'x';
 919                                 else if (buffer[1] != 'x') {
 920                                         memmove(buffer+2, buffer, strlen(buffer)+1);
 921                                         buffer[0] = '0';
 922                                         buffer[1] = 'x';
 923                                 }
 924                                 appendstring(buffer);
 925                                 break;
 926                         case '%':
 927                                 *s++ = '%';
 928                                 break;
 929                         default:
 930                                 appendstring(p);
 931                                 goto end;
 932                         }
 933                 } else
 934                         *s++ = *f;
 935         }
 936
 937  end:
 938         if (callresults)
 939                 PyObject_Free(callresults);
 940         if (abuffer)
 941                 PyObject_Free(abuffer);
 942         PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 943         return string;
 944  fail:
 945         if (callresults) {
 946                 PyObject **callresult2 = callresults;
 947                 while (callresult2 < callresult) {
 948                         Py_DECREF(*callresult2);
 949                         ++callresult2;
 950                 }
 951                 PyObject_Free(callresults);
 952         }
 953         if (abuffer)
 954                 PyObject_Free(abuffer);
 955         return NULL;
 956 }
 957
 958 #undef appendstring
 959
 960 PyObject *
 961 PyUnicode_FromFormat(const char *format, ...)
 962 {
 963         PyObject* ret;
 964         va_list vargs;
 965
 966 #ifdef HAVE_STDARG_PROTOTYPES
 967         va_start(vargs, format);
 968 #else
 969         va_start(vargs);
 970 #endif
 971         ret = PyUnicode_FromFormatV(format, vargs);
 972         va_end(vargs);
 973         return ret;
 974 }
 975
 976 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 977                                 wchar_t *w,
 978                                 Py_ssize_t size)
 979 {
 980     if (unicode == NULL) {
 981         PyErr_BadInternalCall();
 982         return -1;
 983     }
 984
 985     /* If possible, try to copy the 0-termination as well */
 986     if (size > PyUnicode_GET_SIZE(unicode))
 987         size = PyUnicode_GET_SIZE(unicode) + 1;
 988
 989 #ifdef HAVE_USABLE_WCHAR_T
 990     memcpy(w, unicode->str, size * sizeof(wchar_t));
 991 #else
 992     {
 993         register Py_UNICODE *u;
 994         register Py_ssize_t i;
 995         u = PyUnicode_AS_UNICODE(unicode);
 996         for (i = size; i > 0; i--)
 997             *w++ = *u++;
 998     }
 999 #endif
1000
1001     if (size > PyUnicode_GET_SIZE(unicode))
1002         return PyUnicode_GET_SIZE(unicode);
1003     else
1004     return size;
1005 }
1006
1007 #endif
1008
1009 PyObject *PyUnicode_FromOrdinal(int ordinal)
1010 {
1011     Py_UNICODE s[1];
1012
1013 #ifdef Py_UNICODE_WIDE
1014     if (ordinal < 0 || ordinal > 0x10ffff) {
1015         PyErr_SetString(PyExc_ValueError,
1016                         "unichr() arg not in range(0x110000) "
1017                         "(wide Python build)");
1018         return NULL;
1019     }
1020 #else
1021     if (ordinal < 0 || ordinal > 0xffff) {
1022         PyErr_SetString(PyExc_ValueError,
1023                         "unichr() arg not in range(0x10000) "
1024                         "(narrow Python build)");
1025         return NULL;
1026     }
1027 #endif
1028
1029     s[0] = (Py_UNICODE)ordinal;
1030     return PyUnicode_FromUnicode(s, 1);
1031 }
1032
1033 PyObject *PyUnicode_FromObject(register PyObject *obj)
1034 {
1035     /* XXX Perhaps we should make this API an alias of
1036            PyObject_Unicode() instead ?! */
1037     if (PyUnicode_CheckExact(obj)) {
1038         Py_INCREF(obj);
1039         return obj;
1040     }
1041     if (PyUnicode_Check(obj)) {
1042         /* For a Unicode subtype that's not a Unicode object,
1043            return a true Unicode object with the same data. */
1044         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1045                                      PyUnicode_GET_SIZE(obj));
1046     }
1047     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1048 }
1049
1050 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1051                                       const char *encoding,
1052                                       const char *errors)
1053 {
1054     const char *s = NULL;
1055     Py_ssize_t len;
1056     PyObject *v;
1057
1058     if (obj == NULL) {
1059         PyErr_BadInternalCall();
1060         return NULL;
1061     }
1062
1063 #if 0
1064     /* For b/w compatibility we also accept Unicode objects provided
1065        that no encodings is given and then redirect to
1066        PyObject_Unicode() which then applies the additional logic for
1067        Unicode subclasses.
1068
1069        NOTE: This API should really only be used for object which
1070              represent *encoded* Unicode !
1071
1072     */
1073         if (PyUnicode_Check(obj)) {
1074             if (encoding) {
1075                 PyErr_SetString(PyExc_TypeError,
1076                                 "decoding Unicode is not supported");
1077             return NULL;
1078             }
1079         return PyObject_Unicode(obj);
1080             }
1081 #else
1082     if (PyUnicode_Check(obj)) {
1083         PyErr_SetString(PyExc_TypeError,
1084                         "decoding Unicode is not supported");
1085         return NULL;
1086         }
1087 #endif
1088
1089     /* Coerce object */
1090     if (PyString_Check(obj)) {
1091             s = PyString_AS_STRING(obj);
1092             len = PyString_GET_SIZE(obj);
1093     }
1094     else if (PyByteArray_Check(obj)) {
1095         /* Python 2.x specific */
1096         PyErr_Format(PyExc_TypeError,
1097                      "decoding bytearray is not supported");
1098         return NULL;
1099     }
1100     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1101         /* Overwrite the error message with something more useful in
1102            case of a TypeError. */
1103         if (PyErr_ExceptionMatches(PyExc_TypeError))
1104         PyErr_Format(PyExc_TypeError,
1105                          "coercing to Unicode: need string or buffer, "
1106                          "%.80s found",
1107                      Py_TYPE(obj)->tp_name);
1108         goto onError;
1109     }
1110
1111     /* Convert to Unicode */
1112     if (len == 0) {
1113         Py_INCREF(unicode_empty);
1114         v = (PyObject *)unicode_empty;
1115     }
1116     else
1117         v = PyUnicode_Decode(s, len, encoding, errors);
1118
1119     return v;
1120
1121  onError:
1122     return NULL;
1123 }
1124
1125 PyObject *PyUnicode_Decode(const char *s,
1126                            Py_ssize_t size,
1127                            const char *encoding,
1128                            const char *errors)
1129 {
1130     PyObject *buffer = NULL, *unicode;
1131
1132     if (encoding == NULL)
1133         encoding = PyUnicode_GetDefaultEncoding();
1134
1135     /* Shortcuts for common default encodings */
1136     if (strcmp(encoding, "utf-8") == 0)
1137         return PyUnicode_DecodeUTF8(s, size, errors);
1138     else if (strcmp(encoding, "latin-1") == 0)
1139         return PyUnicode_DecodeLatin1(s, size, errors);
1140 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1141     else if (strcmp(encoding, "mbcs") == 0)
1142         return PyUnicode_DecodeMBCS(s, size, errors);
1143 #endif
1144     else if (strcmp(encoding, "ascii") == 0)
1145         return PyUnicode_DecodeASCII(s, size, errors);
1146
1147     /* Decode via the codec registry */
1148     buffer = PyBuffer_FromMemory((void *)s, size);
1149     if (buffer == NULL)
1150         goto onError;
1151     unicode = PyCodec_Decode(buffer, encoding, errors);
1152     if (unicode == NULL)
1153         goto onError;
1154     if (!PyUnicode_Check(unicode)) {
1155         PyErr_Format(PyExc_TypeError,
1156                      "decoder did not return an unicode object (type=%.400s)",
1157                      Py_TYPE(unicode)->tp_name);
1158         Py_DECREF(unicode);
1159         goto onError;
1160     }
1161     Py_DECREF(buffer);
1162     return unicode;
1163
1164  onError:
1165     Py_XDECREF(buffer);
1166     return NULL;
1167 }
1168
1169 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1170                                     const char *encoding,
1171                                     const char *errors)
1172 {
1173     PyObject *v;
1174
1175     if (!PyUnicode_Check(unicode)) {
1176         PyErr_BadArgument();
1177         goto onError;
1178     }
1179
1180     if (encoding == NULL)
1181         encoding = PyUnicode_GetDefaultEncoding();
1182
1183     /* Decode via the codec registry */
1184     v = PyCodec_Decode(unicode, encoding, errors);
1185     if (v == NULL)
1186         goto onError;
1187     return v;
1188
1189  onError:
1190     return NULL;
1191 }
1192
1193 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1194                            Py_ssize_t size,
1195                            const char *encoding,
1196                            const char *errors)
1197 {
1198     PyObject *v, *unicode;
1199
1200     unicode = PyUnicode_FromUnicode(s, size);
1201     if (unicode == NULL)
1202         return NULL;
1203     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1204     Py_DECREF(unicode);
1205     return v;
1206 }
1207
1208 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1209                                     const char *encoding,
1210                                     const char *errors)
1211 {
1212     PyObject *v;
1213
1214     if (!PyUnicode_Check(unicode)) {
1215         PyErr_BadArgument();
1216         goto onError;
1217     }
1218
1219     if (encoding == NULL)
1220         encoding = PyUnicode_GetDefaultEncoding();
1221
1222     /* Encode via the codec registry */
1223     v = PyCodec_Encode(unicode, encoding, errors);
1224     if (v == NULL)
1225         goto onError;
1226     return v;
1227
1228  onError:
1229     return NULL;
1230 }
1231
1232 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1233                                     const char *encoding,
1234                                     const char *errors)
1235 {
1236     PyObject *v;
1237
1238     if (!PyUnicode_Check(unicode)) {
1239         PyErr_BadArgument();
1240         goto onError;
1241     }
1242
1243     if (encoding == NULL)
1244         encoding = PyUnicode_GetDefaultEncoding();
1245
1246     /* Shortcuts for common default encodings */
1247     if (errors == NULL) {
1248         if (strcmp(encoding, "utf-8") == 0)
1249             return PyUnicode_AsUTF8String(unicode);
1250         else if (strcmp(encoding, "latin-1") == 0)
1251             return PyUnicode_AsLatin1String(unicode);
1252 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1253         else if (strcmp(encoding, "mbcs") == 0)
1254             return PyUnicode_AsMBCSString(unicode);
1255 #endif
1256         else if (strcmp(encoding, "ascii") == 0)
1257             return PyUnicode_AsASCIIString(unicode);
1258     }
1259
1260     /* Encode via the codec registry */
1261     v = PyCodec_Encode(unicode, encoding, errors);
1262     if (v == NULL)
1263         goto onError;
1264     if (!PyString_Check(v)) {
1265         PyErr_Format(PyExc_TypeError,
1266                      "encoder did not return a string object (type=%.400s)",
1267                      Py_TYPE(v)->tp_name);
1268         Py_DECREF(v);
1269         goto onError;
1270     }
1271     return v;
1272
1273  onError:
1274     return NULL;
1275 }
1276
1277 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1278                                             const char *errors)
1279 {
1280     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1281
1282     if (v)
1283         return v;
1284     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1285     if (v && errors == NULL)
1286         ((PyUnicodeObject *)unicode)->defenc = v;
1287     return v;
1288 }
1289
1290 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1291 {
1292     if (!PyUnicode_Check(unicode)) {
1293         PyErr_BadArgument();
1294         goto onError;
1295     }
1296     return PyUnicode_AS_UNICODE(unicode);
1297
1298  onError:
1299     return NULL;
1300 }
1301
1302 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1303 {
1304     if (!PyUnicode_Check(unicode)) {
1305         PyErr_BadArgument();
1306         goto onError;
1307     }
1308     return PyUnicode_GET_SIZE(unicode);
1309
1310  onError:
1311     return -1;
1312 }
1313
1314 const char *PyUnicode_GetDefaultEncoding(void)
1315 {
1316     return unicode_default_encoding;
1317 }
1318
1319 int PyUnicode_SetDefaultEncoding(const char *encoding)
1320 {
1321     PyObject *v;
1322
1323     /* Make sure the encoding is valid. As side effect, this also
1324        loads the encoding into the codec registry cache. */
1325     v = _PyCodec_Lookup(encoding);
1326     if (v == NULL)
1327         goto onError;
1328     Py_DECREF(v);
1329     strncpy(unicode_default_encoding,
1330             encoding,
1331             sizeof(unicode_default_encoding));
1332     return 0;
1333
1334  onError:
1335     return -1;
1336 }
1337
1338 /* error handling callback helper:
1339    build arguments, call the callback and check the arguments,
1340    if no exception occurred, copy the replacement to the output
1341    and adjust various state variables.
1342    return 0 on success, -1 on error
1343 */
1344
1345 static
1346 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1347                  const char *encoding, const char *reason,
1348                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1349                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1350                  PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1351 {
1352     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1353
1354     PyObject *restuple = NULL;
1355     PyObject *repunicode = NULL;
1356     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1357     Py_ssize_t requiredsize;
1358     Py_ssize_t newpos;
1359     Py_UNICODE *repptr;
1360     Py_ssize_t repsize;
1361     int res = -1;
1362
1363     if (*errorHandler == NULL) {
1364         *errorHandler = PyCodec_LookupError(errors);
1365         if (*errorHandler == NULL)
1366            goto onError;
1367     }
1368
1369     if (*exceptionObject == NULL) {
1370         *exceptionObject = PyUnicodeDecodeError_Create(
1371             encoding, input, insize, *startinpos, *endinpos, reason);
1372         if (*exceptionObject == NULL)
1373            goto onError;
1374     }
1375     else {
1376         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1377             goto onError;
1378         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1379             goto onError;
1380         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1381             goto onError;
1382     }
1383
1384     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1385     if (restuple == NULL)
1386         goto onError;
1387     if (!PyTuple_Check(restuple)) {
1388         PyErr_Format(PyExc_TypeError, &argparse[4]);
1389         goto onError;
1390     }
1391     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1392         goto onError;
1393     if (newpos<0)
1394         newpos = insize+newpos;
1395     if (newpos<0 || newpos>insize) {
1396         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1397         goto onError;
1398     }
1399
1400     /* need more space? (at least enough for what we
1401        have+the replacement+the rest of the string (starting
1402        at the new input position), so we won't have to check space
1403        when there are no errors in the rest of the string) */
1404     repptr = PyUnicode_AS_UNICODE(repunicode);
1405     repsize = PyUnicode_GET_SIZE(repunicode);
1406     requiredsize = *outpos + repsize + insize-newpos;
1407     if (requiredsize > outsize) {
1408         if (requiredsize<2*outsize)
1409             requiredsize = 2*outsize;
1410         if (_PyUnicode_Resize(output, requiredsize) < 0)
1411             goto onError;
1412         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1413     }
1414     *endinpos = newpos;
1415     *inptr = input + newpos;
1416     Py_UNICODE_COPY(*outptr, repptr, repsize);
1417     *outptr += repsize;
1418     *outpos += repsize;
1419     /* we made it! */
1420     res = 0;
1421
1422     onError:
1423     Py_XDECREF(restuple);
1424     return res;
1425 }
1426
1427 /* --- UTF-7 Codec -------------------------------------------------------- */
1428
1429 /* see RFC2152 for details */
1430
/* Per-character classification for the 128 ASCII code points: indicates
   whether a UTF-7 character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive code points. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1449
/* SPECIAL(c, encodeO, encodeWS): true when character c must be base64
   encoded.  That is the case for every c outside 1..127 and for class-1
   entries of utf7_special; class 2 (whitespace) and class 3 (Set O)
   entries count as special only when encodeWS / encodeO are true.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Explicit
   ASCII ranges are used instead of isalnum(), whose result depends on
   the current locale and could accept bytes >= 0x80. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the topmost 6 pending bits as a base64 digit through out and decrease
   bits accordingly.  Modifies out and bits. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost 16 pending bits as one Py_UNICODE and store
   it through out.  Values in 0xDC00..0xDFFF are not stored: the macro
   sets surrogate, assigns errmsg and jumps to utf7Error, so it may only
   be expanded where an `errmsg` variable and a `utf7Error` label are in
   scope.  The 16-bit unit following such an error is dropped silently.
   Modifies out, bits and surrogate. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* An error was already generated for the previous unit */  \
            /* of the pair, so this one is skipped without checking. */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* Part of a surrogate pair. Unfortunately it cannot be */  \
            /* represented in a 16-bit character. */                    \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
1492
1493 PyObject *PyUnicode_DecodeUTF7(const char *s,
1494                                Py_ssize_t size,
1495                                const char *errors)
1496 {
1497     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1498 }
1499
1500 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1501                                Py_ssize_t size,
1502                                const char *errors,
1503                                Py_ssize_t *consumed)
1504 {
1505     const char *starts = s;
1506     Py_ssize_t startinpos;
1507     Py_ssize_t endinpos;
1508     Py_ssize_t outpos;
1509     const char *e;
1510     PyUnicodeObject *unicode;
1511     Py_UNICODE *p;
1512     const char *errmsg = "";
1513     int inShift = 0;
1514     unsigned int bitsleft = 0;
1515     unsigned long charsleft = 0;
1516     int surrogate = 0;
1517     PyObject *errorHandler = NULL;
1518     PyObject *exc = NULL;
1519
1520     unicode = _PyUnicode_New(size);
1521     if (!unicode)
1522         return NULL;
1523     if (size == 0) {
1524         if (consumed)
1525             *consumed = 0;
1526         return (PyObject *)unicode;
1527     }
1528
1529     p = unicode->str;
1530     e = s + size;
1531
1532     while (s < e) {
1533         Py_UNICODE ch;
1534         restart:
1535         ch = (unsigned char) *s;
1536
1537         if (inShift) {
1538             if ((ch == '-') || !B64CHAR(ch)) {
1539                 inShift = 0;
1540                 s++;
1541
1542                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543                 if (bitsleft >= 6) {
1544                     /* The shift sequence has a partial character in it. If
1545                        bitsleft < 6 then we could just classify it as padding
1546                        but that is not the case here */
1547
1548                     errmsg = "partial character in shift sequence";
1549                     goto utf7Error;
1550                 }
1551                 /* According to RFC2152 the remaining bits should be zero. We
1552                    choose to signal an error/insert a replacement character
1553                    here so indicate the potential of a misencoded character. */
1554
1555                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1556                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1557                     errmsg = "non-zero padding bits in shift sequence";
1558                     goto utf7Error;
1559                 }
1560
1561                 if (ch == '-') {
1562                     if ((s < e) && (*(s) == '-')) {
1563                         *p++ = '-';
1564                         inShift = 1;
1565                     }
1566                 } else if (SPECIAL(ch,0,0)) {
1567                     errmsg = "unexpected special character";
1568                         goto utf7Error;
1569                 } else  {
1570                     *p++ = ch;
1571                 }
1572             } else {
1573                 charsleft = (charsleft << 6) | UB64(ch);
1574                 bitsleft += 6;
1575                 s++;
1576                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1577             }
1578         }
1579         else if ( ch == '+' ) {
1580             startinpos = s-starts;
1581             s++;
1582             if (s < e && *s == '-') {
1583                 s++;
1584                 *p++ = '+';
1585             } else
1586             {
1587                 inShift = 1;
1588                 bitsleft = 0;
1589             }
1590         }
1591         else if (SPECIAL(ch,0,0)) {
1592             startinpos = s-starts;
1593             errmsg = "unexpected special character";
1594             s++;
1595                 goto utf7Error;
1596         }
1597         else {
1598             *p++ = ch;
1599             s++;
1600         }
1601         continue;
1602     utf7Error:
1603         outpos = p-PyUnicode_AS_UNICODE(unicode);
1604         endinpos = s-starts;
1605         if (unicode_decode_call_errorhandler(
1606              errors, &errorHandler,
1607              "utf7", errmsg,
1608              starts, size, &startinpos, &endinpos, &exc, &s,
1609              &unicode, &outpos, &p))
1610         goto onError;
1611     }
1612
1613     if (inShift && !consumed) {
1614         outpos = p-PyUnicode_AS_UNICODE(unicode);
1615         endinpos = size;
1616         if (unicode_decode_call_errorhandler(
1617              errors, &errorHandler,
1618              "utf7", "unterminated shift sequence",
1619              starts, size, &startinpos, &endinpos, &exc, &s,
1620              &unicode, &outpos, &p))
1621             goto onError;
1622         if (s < e)
1623            goto restart;
1624     }
1625     if (consumed) {
1626         if(inShift)
1627             *consumed = startinpos;
1628         else
1629             *consumed = s-starts;
1630     }
1631
1632     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1633         goto onError;
1634
1635     Py_XDECREF(errorHandler);
1636     Py_XDECREF(exc);
1637     return (PyObject *)unicode;
1638
1639 onError:
1640     Py_XDECREF(errorHandler);
1641     Py_XDECREF(exc);
1642     Py_DECREF(unicode);
1643     return NULL;
1644 }
1645
1646
/* Encode the `size` Py_UNICODE characters at `s` as UTF-7 and return the
   result as a new string object (NULL with an exception set if the output
   buffer cannot be allocated or the size computation overflows).

   encodeSetO and encodeWhiteSpace are handed unchanged to the SPECIAL()
   macro, which decides whether a character has to be base64-encoded.
   `errors` is accepted but not referenced in this function.

   NOTE(review): SPECIAL, B64, B64CHAR and ENCODE are macros defined outside
   this view; the comments below describe them only as far as their use here
   shows.  Each character contributes exactly 16 bits to the shift buffer,
   which presumably assumes 16-bit code units -- confirm for wide builds. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   Py_ssize_t size,
                   int encodeSetO,
                   int encodeWhiteSpace,
                   const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    Py_ssize_t cbAllocated = 5 * size;
    int inShift = 0;            /* non-zero while inside a '+...' base64 run */
    Py_ssize_t i = 0;
    unsigned int bitsleft = 0;  /* number of buffered bits not yet written */
    unsigned long charsleft = 0;/* the buffered bits themselves */
    char * out;                 /* next free byte in the output buffer */
    char * start;               /* first byte of the output buffer */

    /* Dividing back detects overflow of the multiplication above. */
    if (cbAllocated / 5 != size)
        return PyErr_NoMemory();

    if (size == 0)
                return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* A literal '+' is written as the two characters "+-". */
                *out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a shift sequence and buffer the 16 bits of ch. */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            } else {
                /* Directly representable character: copy it through. */
                *out++ = (char) ch;
            }
        } else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Leaving the shift sequence: write the buffered bits,
                   shifted up to fill one 6-bit group. */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                /* Another special character: append its 16 bits to the
                   buffer and emit whatever ENCODE can write from it. */
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            /* Stay in the shift sequence. */

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* End of input: close the sequence explicitly. */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }
    /* Write out bits still buffered at the end of the input and close the
       shift sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* Trim the over-allocated string to the number of bytes written.  The
       return value is not checked here; v is returned as _PyString_Resize
       left it. */
    _PyString_Resize(&v, out - start);
    return v;
}
1742
/* Undefine the UTF-7 codec helper macros so the names are free for the
   rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1749
1750 /* --- UTF-8 Codec -------------------------------------------------------- */
1751
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix.
   See RFC 2279 for details.

   This is read-only lookup data, hence const.  Lengths 5 and 6 are listed
   so that the decoder can recognise such sequences and reject them as a
   unit ("unsupported Unicode code range"). */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1773
1774 PyObject *PyUnicode_DecodeUTF8(const char *s,
1775                                Py_ssize_t size,
1776                                const char *errors)
1777 {
1778     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1779 }
1780
/* Decode `size` bytes of UTF-8 data at `s` into a new unicode object.

   errors   - error handling name, passed to
              unicode_decode_call_errorhandler() for every malformed
              sequence.
   consumed - if non-NULL, an incomplete sequence at the end of the input is
              not an error: decoding stops in front of it and *consumed
              receives the number of bytes processed.  If NULL, such a
              sequence is reported as "unexpected end of data".

   Returns the new object, or NULL if the allocation, the error handler or
   the final resize fails.  On builds without Py_UNICODE_WIDE a four-byte
   sequence is stored as a surrogate pair. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                        Py_ssize_t size,
                                        const char *errors,
                                        Py_ssize_t *consumed)
{
    const char *starts = s;     /* start of input, for computing offsets */
    int n;                      /* length of the current sequence */
    Py_ssize_t startinpos;      /* error range start (offset into input) */
    Py_ssize_t endinpos;        /* error range end (offset into input) */
    Py_ssize_t outpos;          /* output offset handed to the error handler */
    const char *e;              /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;              /* next free slot in the output buffer */
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes map to themselves. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the prefix byte runs past the end of
           the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = size;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above and the table has no
               length-1 entry at or above 0x80. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            /* The trailing byte must have the form 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+2;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject overlong forms: values below 0x80 fit in one byte. */
            if (ch < 0x80) {
                startinpos = s-starts;
                endinpos = startinpos+2;
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            if (ch < 0x0800) {
                /* Overlong form: values below 0x0800 fit in fewer bytes.

                   Note: only overlong forms are rejected here.  UTF-8
                   encodings of surrogates are considered legal UTF-8
                   sequences and are stored as-is by the else branch;

                   XXX For wide builds (UCS-4) we should probably try
                       to recombine the surrogates into a single code
                       unit.
                */
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+3;
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                         byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                         UTF-16 */
            {
                errmsg = "illegal encoding";
                startinpos = s-starts;
                endinpos = startinpos+4;
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            startinpos = s-starts;
            endinpos = startinpos+n;
            goto utf8Error;
        }
        s += n;
        continue;

    /* Shared error exit for the loop body.  The handler receives the
       addresses of s, unicode, outpos and p and may update them; a
       non-zero return aborts decoding. */
    utf8Error:
    outpos = p-PyUnicode_AS_UNICODE(unicode);
    if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf8", errmsg,
             starts, size, &startinpos, &endinpos, &exc, &s,
             &unicode, &outpos, &p))
        goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1967
1968 /* Allocation strategy:  if the string is short, convert into a stack buffer
1969    and allocate exactly as much space needed at the end.  Else allocate the
1970    maximum possible needed (4 result bytes per Unicode character), and return
1971    the excess memory at the end.
1972 */
1973 PyObject *
1974 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1975                      Py_ssize_t size,
1976                      const char *errors)
1977 {
1978 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1979
1980     Py_ssize_t i;           /* index into s of next input byte */
1981     PyObject *v;        /* result string object */
1982     char *p;            /* next free byte in output buffer */
1983     Py_ssize_t nallocated;  /* number of result bytes allocated */
1984     Py_ssize_t nneeded;        /* number of result bytes needed */
1985     char stackbuf[MAX_SHORT_UNICHARS * 4];
1986
1987     assert(s != NULL);
1988     assert(size >= 0);
1989
1990     if (size <= MAX_SHORT_UNICHARS) {
1991         /* Write into the stack buffer; nallocated can't overflow.
1992          * At the end, we'll allocate exactly as much heap space as it
1993          * turns out we need.
1994          */
1995         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1996         v = NULL;   /* will allocate after we're done */
1997         p = stackbuf;
1998     }
1999     else {
2000         /* Overallocate on the heap, and give the excess back at the end. */
2001         nallocated = size * 4;
2002         if (nallocated / 4 != size)  /* overflow! */
2003             return PyErr_NoMemory();
2004         v = PyString_FromStringAndSize(NULL, nallocated);
2005         if (v == NULL)
2006             return NULL;
2007         p = PyString_AS_STRING(v);
2008     }
2009
2010     for (i = 0; i < size;) {
2011         Py_UCS4 ch = s[i++];
2012
2013         if (ch < 0x80)
2014             /* Encode ASCII */
2015             *p++ = (char) ch;
2016
2017         else if (ch < 0x0800) {
2018             /* Encode Latin-1 */
2019             *p++ = (char)(0xc0 | (ch >> 6));
2020             *p++ = (char)(0x80 | (ch & 0x3f));
2021         }
2022         else {
2023             /* Encode UCS2 Unicode ordinals */
2024             if (ch < 0x10000) {
2025                 /* Special case: check for high surrogate */
2026                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2027                     Py_UCS4 ch2 = s[i];
2028                     /* Check for low surrogate and combine the two to
2029                        form a UCS4 value */
2030                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2031                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2032                         i++;
2033                         goto encodeUCS4;
2034                     }
2035                     /* Fall through: handles isolated high surrogates */
2036                 }
2037                 *p++ = (char)(0xe0 | (ch >> 12));
2038                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2039                 *p++ = (char)(0x80 | (ch & 0x3f));
2040                 continue;
2041             }
2042 encodeUCS4:
2043             /* Encode UCS4 Unicode ordinals */
2044             *p++ = (char)(0xf0 | (ch >> 18));
2045             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2046             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2047             *p++ = (char)(0x80 | (ch & 0x3f));
2048         }
2049     }
2050
2051     if (v == NULL) {
2052         /* This was stack allocated. */
2053         nneeded = p - stackbuf;
2054         assert(nneeded <= nallocated);
2055         v = PyString_FromStringAndSize(stackbuf, nneeded);
2056     }
2057     else {
2058         /* Cut back to size actually needed. */
2059         nneeded = p - PyString_AS_STRING(v);
2060         assert(nneeded <= nallocated);
2061         _PyString_Resize(&v, nneeded);
2062     }
2063     return v;
2064
2065 #undef MAX_SHORT_UNICHARS
2066 }
2067
2068 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2069 {
2070     if (!PyUnicode_Check(unicode)) {
2071         PyErr_BadArgument();
2072         return NULL;
2073     }
2074     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2075                                 PyUnicode_GET_SIZE(unicode),
2076                                 NULL);
2077 }
2078
2079 /* --- UTF-32 Codec ------------------------------------------------------- */
2080
2081 PyObject *
2082 PyUnicode_DecodeUTF32(const char *s,
2083                       Py_ssize_t size,
2084                       const char *errors,
2085                       int *byteorder)
2086 {
2087     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2088 }
2089
2090 PyObject *
2091 PyUnicode_DecodeUTF32Stateful(const char *s,
2092                               Py_ssize_t size,
2093                               const char *errors,
2094                               int *byteorder,
2095                               Py_ssize_t *consumed)
2096 {
2097     const char *starts = s;
2098     Py_ssize_t startinpos;
2099     Py_ssize_t endinpos;
2100     Py_ssize_t outpos;
2101     PyUnicodeObject *unicode;
2102     Py_UNICODE *p;
2103 #ifndef Py_UNICODE_WIDE
2104     int i, pairs;
2105 #else
2106     const int pairs = 0;
2107 #endif
2108     const unsigned char *q, *e;
2109     int bo = 0;       /* assume native ordering by default */
2110     const char *errmsg = "";
2111     /* Offsets from q for retrieving bytes in the right order. */
2112 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2113     int iorder[] = {0, 1, 2, 3};
2114 #else
2115     int iorder[] = {3, 2, 1, 0};
2116 #endif
2117     PyObject *errorHandler = NULL;
2118     PyObject *exc = NULL;
2119     /* On narrow builds we split characters outside the BMP into two
2120        codepoints => count how much extra space we need. */
2121 #ifndef Py_UNICODE_WIDE
2122     for (i = pairs = 0; i < size/4; i++)
2123         if (((Py_UCS4 *)s)[i] >= 0x10000)
2124             pairs++;
2125 #endif
2126
2127     /* This might be one to much, because of a BOM */
2128     unicode = _PyUnicode_New((size+3)/4+pairs);
2129     if (!unicode)
2130         return NULL;
2131     if (size == 0)
2132         return (PyObject *)unicode;
2133
2134     /* Unpack UTF-32 encoded data */
2135     p = unicode->str;
2136     q = (unsigned char *)s;
2137     e = q + size;
2138
2139     if (byteorder)
2140         bo = *byteorder;
2141
2142     /* Check for BOM marks (U+FEFF) in the input and adjust current
2143        byte order setting accordingly. In native mode, the leading BOM
2144        mark is skipped, in all other modes, it is copied to the output
2145        stream as-is (giving a ZWNBSP character). */
2146     if (bo == 0) {
2147         if (size >= 4) {
2148             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2149                                 (q[iorder[1]] << 8) | q[iorder[0]];
2150 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2151             if (bom == 0x0000FEFF) {
2152                 q += 4;
2153                 bo = -1;
2154             }
2155             else if (bom == 0xFFFE0000) {
2156                 q += 4;
2157                 bo = 1;
2158             }
2159 #else
2160             if (bom == 0x0000FEFF) {
2161                 q += 4;
2162                 bo = 1;
2163             }
2164             else if (bom == 0xFFFE0000) {
2165                 q += 4;
2166                 bo = -1;
2167             }
2168 #endif
2169         }
2170     }
2171
2172     if (bo == -1) {
2173         /* force LE */
2174         iorder[0] = 0;
2175         iorder[1] = 1;
2176         iorder[2] = 2;
2177         iorder[3] = 3;
2178     }
2179     else if (bo == 1) {
2180         /* force BE */
2181         iorder[0] = 3;
2182         iorder[1] = 2;
2183         iorder[2] = 1;
2184         iorder[3] = 0;
2185     }
2186
2187     while (q < e) {
2188         Py_UCS4 ch;
2189         /* remaining bytes at the end? (size should be divisible by 4) */
2190         if (e-q<4) {
2191             if (consumed)
2192                 break;
2193             errmsg = "truncated data";
2194             startinpos = ((const char *)q)-starts;
2195             endinpos = ((const char *)e)-starts;
2196             goto utf32Error;
2197             /* The remaining input chars are ignored if the callback
2198                chooses to skip the input */
2199         }
2200         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2201              (q[iorder[1]] << 8) | q[iorder[0]];
2202
2203         if (ch >= 0x110000)
2204         {
2205             errmsg = "codepoint not in range(0x110000)";
2206             startinpos = ((const char *)q)-starts;
2207             endinpos = startinpos+4;
2208             goto utf32Error;
2209         }
2210 #ifndef Py_UNICODE_WIDE
2211         if (ch >= 0x10000)
2212         {
2213             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2214             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2215         }
2216         else
2217 #endif
2218             *p++ = ch;
2219         q += 4;
2220         continue;
2221     utf32Error:
2222         outpos = p-PyUnicode_AS_UNICODE(unicode);
2223     if (unicode_decode_call_errorhandler(
2224          errors, &errorHandler,
2225          "utf32", errmsg,
2226          starts, size, &startinpos, &endinpos, &exc, &s,
2227          &unicode, &outpos, &p))
2228             goto onError;
2229     }
2230
2231     if (byteorder)
2232         *byteorder = bo;
2233
2234     if (consumed)
2235         *consumed = (const char *)q-starts;
2236
2237     /* Adjust length */
2238     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2239         goto onError;
2240
2241     Py_XDECREF(errorHandler);
2242     Py_XDECREF(exc);
2243     return (PyObject *)unicode;
2244
2245 onError:
2246     Py_DECREF(unicode);
2247     Py_XDECREF(errorHandler);
2248     Py_XDECREF(exc);
2249     return NULL;
2250 }
2251
2252 PyObject *
2253 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2254                       Py_ssize_t size,
2255                       const char *errors,
2256                       int byteorder)
2257 {
2258     PyObject *v;
2259     unsigned char *p;
2260     Py_ssize_t nsize, bytesize;
2261 #ifndef Py_UNICODE_WIDE
2262     Py_ssize_t i, pairs;
2263 #else
2264     const int pairs = 0;
2265 #endif
2266     /* Offsets from p for storing byte pairs in the right order. */
2267 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2268     int iorder[] = {0, 1, 2, 3};
2269 #else
2270     int iorder[] = {3, 2, 1, 0};
2271 #endif
2272
2273 #define STORECHAR(CH)                       \
2274     do {                                    \
2275         p[iorder[3]] = ((CH) >> 24) & 0xff; \
2276         p[iorder[2]] = ((CH) >> 16) & 0xff; \
2277         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2278         p[iorder[0]] = (CH) & 0xff;         \
2279         p += 4;                             \
2280     } while(0)
2281
2282     /* In narrow builds we can output surrogate pairs as one codepoint,
2283        so we need less space. */
2284 #ifndef Py_UNICODE_WIDE
2285     for (i = pairs = 0; i < size-1; i++)
2286         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2287             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2288             pairs++;
2289 #endif
2290     nsize = (size - pairs + (byteorder == 0));
2291     bytesize = nsize * 4;
2292     if (bytesize / 4 != nsize)
2293         return PyErr_NoMemory();
2294     v = PyString_FromStringAndSize(NULL, bytesize);
2295     if (v == NULL)
2296         return NULL;
2297
2298     p = (unsigned char *)PyString_AS_STRING(v);
2299     if (byteorder == 0)
2300         STORECHAR(0xFEFF);
2301     if (size == 0)
2302         return v;
2303
2304     if (byteorder == -1) {
2305         /* force LE */
2306         iorder[0] = 0;
2307         iorder[1] = 1;
2308         iorder[2] = 2;
2309         iorder[3] = 3;
2310     }
2311     else if (byteorder == 1) {
2312         /* force BE */
2313         iorder[0] = 3;
2314         iorder[1] = 2;
2315         iorder[2] = 1;
2316         iorder[3] = 0;
2317     }
2318
2319     while (size-- > 0) {
2320         Py_UCS4 ch = *s++;
2321 #ifndef Py_UNICODE_WIDE
2322         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2323             Py_UCS4 ch2 = *s;
2324             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2325                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2326                 s++;
2327                 size--;
2328             }
2329         }
2330 #endif
2331         STORECHAR(ch);
2332     }
2333     return v;
2334 #undef STORECHAR
2335 }
2336
2337 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2338 {
2339     if (!PyUnicode_Check(unicode)) {
2340         PyErr_BadArgument();
2341         return NULL;
2342     }
2343     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2344                                  PyUnicode_GET_SIZE(unicode),
2345                                  NULL,
2346                                  0);
2347 }
2348
2349 /* --- UTF-16 Codec ------------------------------------------------------- */
2350
2351 PyObject *
2352 PyUnicode_DecodeUTF16(const char *s,
2353                       Py_ssize_t size,
2354                       const char *errors,
2355                       int *byteorder)
2356 {
2357     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2358 }
2359
2360 PyObject *
2361 PyUnicode_DecodeUTF16Stateful(const char *s,
2362                               Py_ssize_t size,
2363                               const char *errors,
2364                               int *byteorder,
2365                               Py_ssize_t *consumed)
2366 {
2367     const char *starts = s;
2368     Py_ssize_t startinpos;
2369     Py_ssize_t endinpos;
2370     Py_ssize_t outpos;
2371     PyUnicodeObject *unicode;
2372     Py_UNICODE *p;
2373     const unsigned char *q, *e;
2374     int bo = 0;       /* assume native ordering by default */
2375     const char *errmsg = "";
2376     /* Offsets from q for retrieving byte pairs in the right order. */
2377 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2378     int ihi = 1, ilo = 0;
2379 #else
2380     int ihi = 0, ilo = 1;
2381 #endif
2382     PyObject *errorHandler = NULL;
2383     PyObject *exc = NULL;
2384
2385     /* Note: size will always be longer than the resulting Unicode
2386        character count */
2387     unicode = _PyUnicode_New(size);
2388     if (!unicode)
2389         return NULL;
2390     if (size == 0)
2391         return (PyObject *)unicode;
2392
2393     /* Unpack UTF-16 encoded data */
2394     p = unicode->str;
2395     q = (unsigned char *)s;
2396     e = q + size;
2397
2398     if (byteorder)
2399         bo = *byteorder;
2400
2401     /* Check for BOM marks (U+FEFF) in the input and adjust current
2402        byte order setting accordingly. In native mode, the leading BOM
2403        mark is skipped, in all other modes, it is copied to the output
2404        stream as-is (giving a ZWNBSP character). */
2405     if (bo == 0) {
2406         if (size >= 2) {
2407             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2408 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2409             if (bom == 0xFEFF) {
2410                 q += 2;
2411                 bo = -1;
2412             }
2413             else if (bom == 0xFFFE) {
2414                 q += 2;
2415                 bo = 1;
2416             }
2417 #else
2418             if (bom == 0xFEFF) {
2419                 q += 2;
2420                 bo = 1;
2421             }
2422             else if (bom == 0xFFFE) {
2423                 q += 2;
2424                 bo = -1;
2425             }
2426 #endif
2427         }
2428     }
2429
2430     if (bo == -1) {
2431         /* force LE */
2432         ihi = 1;
2433         ilo = 0;
2434     }
2435     else if (bo == 1) {
2436         /* force BE */
2437         ihi = 0;
2438         ilo = 1;
2439     }
2440
2441     while (q < e) {
2442         Py_UNICODE ch;
2443         /* remaining bytes at the end? (size should be even) */
2444         if (e-q<2) {
2445             if (consumed)
2446                 break;
2447             errmsg = "truncated data";
2448             startinpos = ((const char *)q)-starts;
2449             endinpos = ((const char *)e)-starts;
2450             goto utf16Error;
2451             /* The remaining input chars are ignored if the callback
2452                chooses to skip the input */
2453         }
2454         ch = (q[ihi] << 8) | q[ilo];
2455
2456         q += 2;
2457
2458         if (ch < 0xD800 || ch > 0xDFFF) {
2459             *p++ = ch;
2460             continue;
2461         }
2462
2463         /* UTF-16 code pair: */
2464         if (q >= e) {
2465             errmsg = "unexpected end of data";
2466             startinpos = (((const char *)q)-2)-starts;
2467             endinpos = ((const char *)e)-starts;
2468             goto utf16Error;
2469         }
2470         if (0xD800 <= ch && ch <= 0xDBFF) {
2471             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2472             q += 2;
2473             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2474 #ifndef Py_UNICODE_WIDE
2475                 *p++ = ch;
2476                 *p++ = ch2;
2477 #else
2478                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2479 #endif
2480                 continue;
2481             }
2482             else {
2483                 errmsg = "illegal UTF-16 surrogate";
2484                 startinpos = (((const char *)q)-4)-starts;
2485                 endinpos = startinpos+2;
2486                 goto utf16Error;
2487             }
2488
2489         }
2490         errmsg = "illegal encoding";
2491         startinpos = (((const char *)q)-2)-starts;
2492         endinpos = startinpos+2;
2493         /* Fall through to report the error */
2494
2495     utf16Error:
2496         outpos = p-PyUnicode_AS_UNICODE(unicode);
2497         if (unicode_decode_call_errorhandler(
2498                  errors, &errorHandler,
2499                  "utf16", errmsg,
2500                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2501                  &unicode, &outpos, &p))
2502             goto onError;
2503     }
2504
2505     if (byteorder)
2506         *byteorder = bo;
2507
2508     if (consumed)
2509         *consumed = (const char *)q-starts;
2510
2511     /* Adjust length */
2512     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2513         goto onError;
2514
2515     Py_XDECREF(errorHandler);
2516     Py_XDECREF(exc);
2517     return (PyObject *)unicode;
2518
2519 onError:
2520     Py_DECREF(unicode);
2521     Py_XDECREF(errorHandler);
2522     Py_XDECREF(exc);
2523     return NULL;
2524 }
2525
2526 PyObject *
2527 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2528                       Py_ssize_t size,
2529                       const char *errors,
2530                       int byteorder)
2531 {
2532     PyObject *v;
2533     unsigned char *p;
2534     Py_ssize_t nsize, bytesize;
2535 #ifdef Py_UNICODE_WIDE
2536     Py_ssize_t i, pairs;
2537 #else
2538     const int pairs = 0;
2539 #endif
2540     /* Offsets from p for storing byte pairs in the right order. */
2541 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2542     int ihi = 1, ilo = 0;
2543 #else
2544     int ihi = 0, ilo = 1;
2545 #endif
2546
2547 #define STORECHAR(CH)                   \
2548     do {                                \
2549         p[ihi] = ((CH) >> 8) & 0xff;    \
2550         p[ilo] = (CH) & 0xff;           \
2551         p += 2;                         \
2552     } while(0)
2553
2554 #ifdef Py_UNICODE_WIDE
2555     for (i = pairs = 0; i < size; i++)
2556         if (s[i] >= 0x10000)
2557             pairs++;
2558 #endif
2559     /* 2 * (size + pairs + (byteorder == 0)) */
2560     if (size > PY_SSIZE_T_MAX ||
2561         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2562         return PyErr_NoMemory();
2563     nsize = size + pairs + (byteorder == 0);
2564     bytesize = nsize * 2;
2565     if (bytesize / 2 != nsize)
2566         return PyErr_NoMemory();
2567     v = PyString_FromStringAndSize(NULL, bytesize);
2568     if (v == NULL)
2569         return NULL;
2570
2571     p = (unsigned char *)PyString_AS_STRING(v);
2572     if (byteorder == 0)
2573         STORECHAR(0xFEFF);
2574     if (size == 0)
2575         return v;
2576
2577     if (byteorder == -1) {
2578         /* force LE */
2579         ihi = 1;
2580         ilo = 0;
2581     }
2582     else if (byteorder == 1) {
2583         /* force BE */
2584         ihi = 0;
2585         ilo = 1;
2586     }
2587
2588     while (size-- > 0) {
2589         Py_UNICODE ch = *s++;
2590         Py_UNICODE ch2 = 0;
2591 #ifdef Py_UNICODE_WIDE
2592         if (ch >= 0x10000) {
2593             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2594             ch  = 0xD800 | ((ch-0x10000) >> 10);
2595         }
2596 #endif
2597         STORECHAR(ch);
2598         if (ch2)
2599             STORECHAR(ch2);
2600     }
2601     return v;
2602 #undef STORECHAR
2603 }
2604
2605 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2606 {
2607     if (!PyUnicode_Check(unicode)) {
2608         PyErr_BadArgument();
2609         return NULL;
2610     }
2611     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2612                                  PyUnicode_GET_SIZE(unicode),
2613                                  NULL,
2614                                  0);
2615 }
2616
2617 /* --- Unicode Escape Codec ----------------------------------------------- */
2618
2619 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2620
2621 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2622                                         Py_ssize_t size,
2623                                         const char *errors)
2624 {
2625     const char *starts = s;
2626     Py_ssize_t startinpos;
2627     Py_ssize_t endinpos;
2628     Py_ssize_t outpos;
2629     int i;
2630     PyUnicodeObject *v;
2631     Py_UNICODE *p;
2632     const char *end;
2633     char* message;
2634     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2635     PyObject *errorHandler = NULL;
2636     PyObject *exc = NULL;
2637
2638     /* Escaped strings will always be longer than the resulting
2639        Unicode string, so we start with size here and then reduce the
2640        length after conversion to the true value.
2641        (but if the error callback returns a long replacement string
2642        we'll have to allocate more space) */
2643     v = _PyUnicode_New(size);
2644     if (v == NULL)
2645         goto onError;
2646     if (size == 0)
2647         return (PyObject *)v;
2648
2649     p = PyUnicode_AS_UNICODE(v);
2650     end = s + size;
2651
2652     while (s < end) {
2653         unsigned char c;
2654         Py_UNICODE x;
2655         int digits;
2656
2657         /* Non-escape characters are interpreted as Unicode ordinals */
2658         if (*s != '\\') {
2659             *p++ = (unsigned char) *s++;
2660             continue;
2661         }
2662
2663         startinpos = s-starts;
2664         /* \ - Escapes */
2665         s++;
2666         c = *s++;
2667         if (s > end)
2668             c = '\0'; /* Invalid after \ */
2669         switch (c) {
2670
2671         /* \x escapes */
2672         case '\n': break;
2673         case '\\': *p++ = '\\'; break;
2674         case '\'': *p++ = '\''; break;
2675         case '\"': *p++ = '\"'; break;
2676         case 'b': *p++ = '\b'; break;
2677         case 'f': *p++ = '\014'; break; /* FF */
2678         case 't': *p++ = '\t'; break;
2679         case 'n': *p++ = '\n'; break;
2680         case 'r': *p++ = '\r'; break;
2681         case 'v': *p++ = '\013'; break; /* VT */
2682         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2683
2684         /* \OOO (octal) escapes */
2685         case '0': case '1': case '2': case '3':
2686         case '4': case '5': case '6': case '7':
2687             x = s[-1] - '0';
2688             if (s < end && '0' <= *s && *s <= '7') {
2689                 x = (x<<3) + *s++ - '0';
2690                 if (s < end && '0' <= *s && *s <= '7')
2691                     x = (x<<3) + *s++ - '0';
2692             }
2693             *p++ = x;
2694             break;
2695
2696         /* hex escapes */
2697         /* \xXX */
2698         case 'x':
2699             digits = 2;
2700             message = "truncated \\xXX escape";
2701             goto hexescape;
2702
2703         /* \uXXXX */
2704         case 'u':
2705             digits = 4;
2706             message = "truncated \\uXXXX escape";
2707             goto hexescape;
2708
2709         /* \UXXXXXXXX */
2710         case 'U':
2711             digits = 8;
2712             message = "truncated \\UXXXXXXXX escape";
2713         hexescape:
2714             chr = 0;
2715             outpos = p-PyUnicode_AS_UNICODE(v);
2716             if (s+digits>end) {
2717                 endinpos = size;
2718                 if (unicode_decode_call_errorhandler(
2719                     errors, &errorHandler,
2720                     "unicodeescape", "end of string in escape sequence",
2721                     starts, size, &startinpos, &endinpos, &exc, &s,
2722                     &v, &outpos, &p))
2723                     goto onError;
2724                 goto nextByte;
2725             }
2726             for (i = 0; i < digits; ++i) {
2727                 c = (unsigned char) s[i];
2728                 if (!isxdigit(c)) {
2729                     endinpos = (s+i+1)-starts;
2730                     if (unicode_decode_call_errorhandler(
2731                         errors, &errorHandler,
2732                         "unicodeescape", message,
2733                         starts, size, &startinpos, &endinpos, &exc, &s,
2734                         &v, &outpos, &p))
2735                         goto onError;
2736                     goto nextByte;
2737                 }
2738                 chr = (chr<<4) & ~0xF;
2739                 if (c >= '0' && c <= '9')
2740                     chr += c - '0';
2741                 else if (c >= 'a' && c <= 'f')
2742                     chr += 10 + c - 'a';
2743                 else
2744                     chr += 10 + c - 'A';
2745             }
2746             s += i;
2747             if (chr == 0xffffffff && PyErr_Occurred())
2748                 /* _decoding_error will have already written into the
2749                    target buffer. */
2750                 break;
2751         store:
2752             /* when we get here, chr is a 32-bit unicode character */
2753             if (chr <= 0xffff)
2754                 /* UCS-2 character */
2755                 *p++ = (Py_UNICODE) chr;
2756             else if (chr <= 0x10ffff) {
2757                 /* UCS-4 character. Either store directly, or as
2758                    surrogate pair. */
2759 #ifdef Py_UNICODE_WIDE
2760                 *p++ = chr;
2761 #else
2762                 chr -= 0x10000L;
2763                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2764                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2765 #endif
2766             } else {
2767                 endinpos = s-starts;
2768                 outpos = p-PyUnicode_AS_UNICODE(v);
2769                 if (unicode_decode_call_errorhandler(
2770                     errors, &errorHandler,
2771                     "unicodeescape", "illegal Unicode character",
2772                     starts, size, &startinpos, &endinpos, &exc, &s,
2773                     &v, &outpos, &p))
2774                     goto onError;
2775             }
2776             break;
2777
2778         /* \N{name} */
2779         case 'N':
2780             message = "malformed \\N character escape";
2781             if (ucnhash_CAPI == NULL) {
2782                 /* load the unicode data module */
2783                 PyObject *m, *api;
2784                 m = PyImport_ImportModuleNoBlock("unicodedata");
2785                 if (m == NULL)
2786                     goto ucnhashError;
2787                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2788                 Py_DECREF(m);
2789                 if (api == NULL)
2790                     goto ucnhashError;
2791                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2792                 Py_DECREF(api);
2793                 if (ucnhash_CAPI == NULL)
2794                     goto ucnhashError;
2795             }
2796             if (*s == '{') {
2797                 const char *start = s+1;
2798                 /* look for the closing brace */
2799                 while (*s != '}' && s < end)
2800                     s++;
2801                 if (s > start && s < end && *s == '}') {
2802                     /* found a name.  look it up in the unicode database */
2803                     message = "unknown Unicode character name";
2804                     s++;
2805                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2806                         goto store;
2807                 }
2808             }
2809             endinpos = s-starts;
2810             outpos = p-PyUnicode_AS_UNICODE(v);
2811             if (unicode_decode_call_errorhandler(
2812                 errors, &errorHandler,
2813                 "unicodeescape", message,
2814                 starts, size, &startinpos, &endinpos, &exc, &s,
2815                 &v, &outpos, &p))
2816                 goto onError;
2817             break;
2818
2819         default:
2820             if (s > end) {
2821                 message = "\\ at end of string";
2822                 s--;
2823                 endinpos = s-starts;
2824                 outpos = p-PyUnicode_AS_UNICODE(v);
2825                 if (unicode_decode_call_errorhandler(
2826                     errors, &errorHandler,
2827                     "unicodeescape", message,
2828                     starts, size, &startinpos, &endinpos, &exc, &s,
2829                     &v, &outpos, &p))
2830                     goto onError;
2831             }
2832             else {
2833                 *p++ = '\\';
2834                 *p++ = (unsigned char)s[-1];
2835             }
2836             break;
2837         }
2838         nextByte:
2839         ;
2840     }
2841     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2842         goto onError;
2843     Py_XDECREF(errorHandler);
2844     Py_XDECREF(exc);
2845     return (PyObject *)v;
2846
2847 ucnhashError:
2848     PyErr_SetString(
2849         PyExc_UnicodeError,
2850         "\\N escapes not supported (can't load unicodedata module)"
2851         );
2852     Py_XDECREF(v);
2853     Py_XDECREF(errorHandler);
2854     Py_XDECREF(exc);
2855     return NULL;
2856
2857 onError:
2858     Py_XDECREF(v);
2859     Py_XDECREF(errorHandler);
2860     Py_XDECREF(exc);
2861     return NULL;
2862 }
2863
/* unicodeescape_string(), defined after the findchar() helper below,
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2870
2871 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2872                                       Py_ssize_t size,
2873                                       Py_UNICODE ch)
2874 {
2875     /* like wcschr, but doesn't stop at NULL characters */
2876
2877     while (size-- > 0) {
2878         if (*s == ch)
2879             return s;
2880         s++;
2881     }
2882
2883     return NULL;
2884 }
2885
2886 static
2887 PyObject *unicodeescape_string(const Py_UNICODE *s,
2888                                Py_ssize_t size,
2889                                int quotes)
2890 {
2891     PyObject *repr;
2892     char *p;
2893
2894     static const char *hexdigit = "0123456789abcdef";
2895 #ifdef Py_UNICODE_WIDE
2896     const Py_ssize_t expandsize = 10;
2897 #else
2898     const Py_ssize_t expandsize = 6;
2899 #endif
2900
2901     /* XXX(nnorwitz): rather than over-allocating, it would be
2902        better to choose a different scheme.  Perhaps scan the
2903        first N-chars of the string and allocate based on that size.
2904     */
2905     /* Initial allocation is based on the longest-possible unichr
2906        escape.
2907
2908        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2909        unichr, so in this case it's the longest unichr escape. In
2910        narrow (UTF-16) builds this is five chars per source unichr
2911        since there are two unichrs in the surrogate pair, so in narrow
2912        (UTF-16) builds it's not the longest unichr escape.
2913
2914        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2915        so in the narrow (UTF-16) build case it's the longest unichr
2916        escape.
2917     */
2918
2919     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2920         return PyErr_NoMemory();
2921
2922     repr = PyString_FromStringAndSize(NULL,
2923         2
2924         + expandsize*size
2925         + 1);
2926     if (repr == NULL)
2927         return NULL;
2928
2929     p = PyString_AS_STRING(repr);
2930
2931     if (quotes) {
2932         *p++ = 'u';
2933         *p++ = (findchar(s, size, '\'') &&
2934                 !findchar(s, size, '"')) ? '"' : '\'';
2935     }
2936     while (size-- > 0) {
2937         Py_UNICODE ch = *s++;
2938
2939         /* Escape quotes and backslashes */
2940         if ((quotes &&
2941              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2942             *p++ = '\\';
2943             *p++ = (char) ch;
2944             continue;
2945         }
2946
2947 #ifdef Py_UNICODE_WIDE
2948         /* Map 21-bit characters to '\U00xxxxxx' */
2949         else if (ch >= 0x10000) {
2950             *p++ = '\\';
2951             *p++ = 'U';
2952             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2953             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2954             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2955             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2956             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2957             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2958             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2959             *p++ = hexdigit[ch & 0x0000000F];
2960             continue;
2961         }
2962 #else
2963         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2964         else if (ch >= 0xD800 && ch < 0xDC00) {
2965             Py_UNICODE ch2;
2966             Py_UCS4 ucs;
2967
2968             ch2 = *s++;
2969             size--;
2970             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2971                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2972                 *p++ = '\\';
2973                 *p++ = 'U';
2974                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2975                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2976                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2977                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2978                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2979                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2980                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2981                 *p++ = hexdigit[ucs & 0x0000000F];
2982                 continue;
2983             }
2984             /* Fall through: isolated surrogates are copied as-is */
2985             s--;
2986             size++;
2987         }
2988 #endif
2989
2990         /* Map 16-bit characters to '\uxxxx' */
2991         if (ch >= 256) {
2992             *p++ = '\\';
2993             *p++ = 'u';
2994             *p++ = hexdigit[(ch >> 12) & 0x000F];
2995             *p++ = hexdigit[(ch >> 8) & 0x000F];
2996             *p++ = hexdigit[(ch >> 4) & 0x000F];
2997             *p++ = hexdigit[ch & 0x000F];
2998         }
2999
3000         /* Map special whitespace to '\t', \n', '\r' */
3001         else if (ch == '\t') {
3002             *p++ = '\\';
3003             *p++ = 't';
3004         }
3005         else if (ch == '\n') {
3006             *p++ = '\\';
3007             *p++ = 'n';
3008         }
3009         else if (ch == '\r') {
3010             *p++ = '\\';
3011             *p++ = 'r';
3012         }
3013
3014         /* Map non-printable US ASCII to '\xhh' */
3015         else if (ch < ' ' || ch >= 0x7F) {
3016             *p++ = '\\';
3017             *p++ = 'x';
3018             *p++ = hexdigit[(ch >> 4) & 0x000F];
3019             *p++ = hexdigit[ch & 0x000F];
3020         }
3021
3022         /* Copy everything else as-is */
3023         else
3024             *p++ = (char) ch;
3025     }
3026     if (quotes)
3027         *p++ = PyString_AS_STRING(repr)[1];
3028
3029     *p = '\0';
3030     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3031     return repr;
3032 }
3033
3034 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3035                                         Py_ssize_t size)
3036 {
3037     return unicodeescape_string(s, size, 0);
3038 }
3039
3040 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3041 {
3042     if (!PyUnicode_Check(unicode)) {
3043         PyErr_BadArgument();
3044         return NULL;
3045     }
3046     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3047                                          PyUnicode_GET_SIZE(unicode));
3048 }
3049
3050 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3051
3052 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3053                                            Py_ssize_t size,
3054                                            const char *errors)
3055 {
3056     const char *starts = s;
3057     Py_ssize_t startinpos;
3058     Py_ssize_t endinpos;
3059     Py_ssize_t outpos;
3060     PyUnicodeObject *v;
3061     Py_UNICODE *p;
3062     const char *end;
3063     const char *bs;
3064     PyObject *errorHandler = NULL;
3065     PyObject *exc = NULL;
3066
3067     /* Escaped strings will always be longer than the resulting
3068        Unicode string, so we start with size here and then reduce the
3069        length after conversion to the true value. (But decoding error
3070        handler might have to resize the string) */
3071     v = _PyUnicode_New(size);
3072     if (v == NULL)
3073         goto onError;
3074     if (size == 0)
3075         return (PyObject *)v;
3076     p = PyUnicode_AS_UNICODE(v);
3077     end = s + size;
3078     while (s < end) {
3079         unsigned char c;
3080         Py_UCS4 x;
3081         int i;
3082         int count;
3083
3084         /* Non-escape characters are interpreted as Unicode ordinals */
3085         if (*s != '\\') {
3086             *p++ = (unsigned char)*s++;
3087             continue;
3088         }
3089         startinpos = s-starts;
3090
3091         /* \u-escapes are only interpreted iff the number of leading
3092            backslashes if odd */
3093         bs = s;
3094         for (;s < end;) {
3095             if (*s != '\\')
3096                 break;
3097             *p++ = (unsigned char)*s++;
3098         }
3099         if (((s - bs) & 1) == 0 ||
3100             s >= end ||
3101             (*s != 'u' && *s != 'U')) {
3102             continue;
3103         }
3104         p--;
3105         count = *s=='u' ? 4 : 8;
3106         s++;
3107
3108         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3109         outpos = p-PyUnicode_AS_UNICODE(v);
3110         for (x = 0, i = 0; i < count; ++i, ++s) {
3111             c = (unsigned char)*s;
3112             if (!isxdigit(c)) {
3113                 endinpos = s-starts;
3114                 if (unicode_decode_call_errorhandler(
3115                     errors, &errorHandler,
3116                     "rawunicodeescape", "truncated \\uXXXX",
3117                     starts, size, &startinpos, &endinpos, &exc, &s,
3118                     &v, &outpos, &p))
3119                     goto onError;
3120                 goto nextByte;
3121             }
3122             x = (x<<4) & ~0xF;
3123             if (c >= '0' && c <= '9')
3124                 x += c - '0';
3125             else if (c >= 'a' && c <= 'f')
3126                 x += 10 + c - 'a';
3127             else
3128                 x += 10 + c - 'A';
3129         }
3130         if (x <= 0xffff)
3131                 /* UCS-2 character */
3132                 *p++ = (Py_UNICODE) x;
3133         else if (x <= 0x10ffff) {
3134                 /* UCS-4 character. Either store directly, or as
3135                    surrogate pair. */
3136 #ifdef Py_UNICODE_WIDE
3137                 *p++ = (Py_UNICODE) x;
3138 #else
3139                 x -= 0x10000L;
3140                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3141                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3142 #endif
3143         } else {
3144             endinpos = s-starts;
3145             outpos = p-PyUnicode_AS_UNICODE(v);
3146             if (unicode_decode_call_errorhandler(
3147                     errors, &errorHandler,
3148                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3149                     starts, size, &startinpos, &endinpos, &exc, &s,
3150                     &v, &outpos, &p))
3151                     goto onError;
3152         }
3153         nextByte:
3154         ;
3155     }
3156     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3157         goto onError;
3158     Py_XDECREF(errorHandler);
3159     Py_XDECREF(exc);
3160     return (PyObject *)v;
3161
3162  onError:
3163     Py_XDECREF(v);
3164     Py_XDECREF(errorHandler);
3165     Py_XDECREF(exc);
3166     return NULL;
3167 }
3168
3169 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3170                                            Py_ssize_t size)
3171 {
3172     PyObject *repr;
3173     char *p;
3174     char *q;
3175
3176     static const char *hexdigit = "0123456789abcdef";
3177 #ifdef Py_UNICODE_WIDE
3178     const Py_ssize_t expandsize = 10;
3179 #else
3180     const Py_ssize_t expandsize = 6;
3181 #endif
3182
3183     if (size > PY_SSIZE_T_MAX / expandsize)
3184         return PyErr_NoMemory();
3185
3186     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3187     if (repr == NULL)
3188         return NULL;
3189     if (size == 0)
3190         return repr;
3191
3192     p = q = PyString_AS_STRING(repr);
3193     while (size-- > 0) {
3194         Py_UNICODE ch = *s++;
3195 #ifdef Py_UNICODE_WIDE
3196         /* Map 32-bit characters to '\Uxxxxxxxx' */
3197         if (ch >= 0x10000) {
3198             *p++ = '\\';
3199             *p++ = 'U';
3200             *p++ = hexdigit[(ch >> 28) & 0xf];
3201             *p++ = hexdigit[(ch >> 24) & 0xf];
3202             *p++ = hexdigit[(ch >> 20) & 0xf];
3203             *p++ = hexdigit[(ch >> 16) & 0xf];
3204             *p++ = hexdigit[(ch >> 12) & 0xf];
3205             *p++ = hexdigit[(ch >> 8) & 0xf];
3206             *p++ = hexdigit[(ch >> 4) & 0xf];
3207             *p++ = hexdigit[ch & 15];
3208         }
3209         else
3210 #else
3211         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3212         if (ch >= 0xD800 && ch < 0xDC00) {
3213             Py_UNICODE ch2;
3214             Py_UCS4 ucs;
3215
3216             ch2 = *s++;
3217             size--;
3218             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3219                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3220                 *p++ = '\\';
3221                 *p++ = 'U';
3222                 *p++ = hexdigit[(ucs >> 28) & 0xf];
3223                 *p++ = hexdigit[(ucs >> 24) & 0xf];
3224                 *p++ = hexdigit[(ucs >> 20) & 0xf];
3225                 *p++ = hexdigit[(ucs >> 16) & 0xf];
3226                 *p++ = hexdigit[(ucs >> 12) & 0xf];
3227                 *p++ = hexdigit[(ucs >> 8) & 0xf];
3228                 *p++ = hexdigit[(ucs >> 4) & 0xf];
3229                 *p++ = hexdigit[ucs & 0xf];
3230                 continue;
3231             }
3232             /* Fall through: isolated surrogates are copied as-is */
3233             s--;
3234             size++;
3235         }
3236 #endif
3237         /* Map 16-bit characters to '\uxxxx' */
3238         if (ch >= 256) {
3239             *p++ = '\\';
3240             *p++ = 'u';
3241             *p++ = hexdigit[(ch >> 12) & 0xf];
3242             *p++ = hexdigit[(ch >> 8) & 0xf];
3243             *p++ = hexdigit[(ch >> 4) & 0xf];
3244             *p++ = hexdigit[ch & 15];
3245         }
3246         /* Copy everything else as-is */
3247         else
3248             *p++ = (char) ch;
3249     }
3250     *p = '\0';
3251     _PyString_Resize(&repr, p - q);
3252     return repr;
3253 }
3254
3255 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3256 {
3257     if (!PyUnicode_Check(unicode)) {
3258         PyErr_BadArgument();
3259         return NULL;
3260     }
3261     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3262                                             PyUnicode_GET_SIZE(unicode));
3263 }
3264
3265 /* --- Unicode Internal Codec ------------------------------------------- */
3266
3267 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3268                                            Py_ssize_t size,
3269                                            const char *errors)
3270 {
3271     const char *starts = s;
3272     Py_ssize_t startinpos;
3273     Py_ssize_t endinpos;
3274     Py_ssize_t outpos;
3275     PyUnicodeObject *v;
3276     Py_UNICODE *p;
3277     const char *end;
3278     const char *reason;
3279     PyObject *errorHandler = NULL;
3280     PyObject *exc = NULL;
3281
3282 #ifdef Py_UNICODE_WIDE
3283     Py_UNICODE unimax = PyUnicode_GetMax();
3284 #endif
3285
3286     /* XXX overflow detection missing */
3287     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3288     if (v == NULL)
3289         goto onError;
3290     if (PyUnicode_GetSize((PyObject *)v) == 0)
3291         return (PyObject *)v;
3292     p = PyUnicode_AS_UNICODE(v);
3293     end = s + size;
3294
3295     while (s < end) {
3296         memcpy(p, s, sizeof(Py_UNICODE));
3297         /* We have to sanity check the raw data, otherwise doom looms for
3298            some malformed UCS-4 data. */
3299         if (
3300             #ifdef Py_UNICODE_WIDE
3301             *p > unimax || *p < 0 ||
3302             #endif
3303             end-s < Py_UNICODE_SIZE
3304             )
3305             {
3306             startinpos = s - starts;
3307             if (end-s < Py_UNICODE_SIZE) {
3308                 endinpos = end-starts;
3309                 reason = "truncated input";
3310             }
3311             else {
3312                 endinpos = s - starts + Py_UNICODE_SIZE;
3313                 reason = "illegal code point (> 0x10FFFF)";
3314             }
3315             outpos = p - PyUnicode_AS_UNICODE(v);
3316             if (unicode_decode_call_errorhandler(
3317                     errors, &errorHandler,
3318                     "unicode_internal", reason,
3319                     starts, size, &startinpos, &endinpos, &exc, &s,
3320                     &v, &outpos, &p)) {
3321                 goto onError;
3322             }
3323         }
3324         else {
3325             p++;
3326             s += Py_UNICODE_SIZE;
3327         }
3328     }
3329
3330     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3331         goto onError;
3332     Py_XDECREF(errorHandler);
3333     Py_XDECREF(exc);
3334     return (PyObject *)v;
3335
3336  onError:
3337     Py_XDECREF(v);
3338     Py_XDECREF(errorHandler);
3339     Py_XDECREF(exc);
3340     return NULL;
3341 }
3342
3343 /* --- Latin-1 Codec ------------------------------------------------------ */
3344
3345 PyObject *PyUnicode_DecodeLatin1(const char *s,
3346                                  Py_ssize_t size,
3347                                  const char *errors)
3348 {
3349     PyUnicodeObject *v;
3350     Py_UNICODE *p;
3351
3352     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3353     if (size == 1) {
3354         Py_UNICODE r = *(unsigned char*)s;
3355         return PyUnicode_FromUnicode(&r, 1);
3356     }
3357
3358     v = _PyUnicode_New(size);
3359     if (v == NULL)
3360         goto onError;
3361     if (size == 0)
3362         return (PyObject *)v;
3363     p = PyUnicode_AS_UNICODE(v);
3364     while (size-- > 0)
3365         *p++ = (unsigned char)*s++;
3366     return (PyObject *)v;
3367
3368  onError:
3369     Py_XDECREF(v);
3370     return NULL;
3371 }
3372
3373 /* create or adjust a UnicodeEncodeError */
3374 static void make_encode_exception(PyObject **exceptionObject,
3375     const char *encoding,
3376     const Py_UNICODE *unicode, Py_ssize_t size,
3377     Py_ssize_t startpos, Py_ssize_t endpos,
3378     const char *reason)
3379 {
3380     if (*exceptionObject == NULL) {
3381         *exceptionObject = PyUnicodeEncodeError_Create(
3382             encoding, unicode, size, startpos, endpos, reason);
3383     }
3384     else {
3385         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3386             goto onError;
3387         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3388             goto onError;
3389         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3390             goto onError;
3391         return;
3392         onError:
3393         Py_DECREF(*exceptionObject);
3394         *exceptionObject = NULL;
3395     }
3396 }
3397
3398 /* raises a UnicodeEncodeError */
3399 static void raise_encode_exception(PyObject **exceptionObject,
3400     const char *encoding,
3401     const Py_UNICODE *unicode, Py_ssize_t size,
3402     Py_ssize_t startpos, Py_ssize_t endpos,
3403     const char *reason)
3404 {
3405     make_encode_exception(exceptionObject,
3406         encoding, unicode, size, startpos, endpos, reason);
3407     if (*exceptionObject != NULL)
3408         PyCodec_StrictErrors(*exceptionObject);
3409 }
3410
3411 /* error handling callback helper:
3412    build arguments, call the callback and check the arguments,
3413    put the result into newpos and return the replacement string, which
3414    has to be freed by the caller */
3415 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3416     PyObject **errorHandler,
3417     const char *encoding, const char *reason,
3418     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3419     Py_ssize_t startpos, Py_ssize_t endpos,
3420     Py_ssize_t *newpos)
3421 {
3422     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3423
3424     PyObject *restuple;
3425     PyObject *resunicode;
3426
3427     if (*errorHandler == NULL) {
3428         *errorHandler = PyCodec_LookupError(errors);
3429         if (*errorHandler == NULL)
3430             return NULL;
3431     }
3432
3433     make_encode_exception(exceptionObject,
3434         encoding, unicode, size, startpos, endpos, reason);
3435     if (*exceptionObject == NULL)
3436         return NULL;
3437
3438     restuple = PyObject_CallFunctionObjArgs(
3439         *errorHandler, *exceptionObject, NULL);
3440     if (restuple == NULL)
3441         return NULL;
3442     if (!PyTuple_Check(restuple)) {
3443         PyErr_Format(PyExc_TypeError, &argparse[4]);
3444         Py_DECREF(restuple);
3445         return NULL;
3446     }
3447     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3448         &resunicode, newpos)) {
3449         Py_DECREF(restuple);
3450         return NULL;
3451     }
3452     if (*newpos<0)
3453         *newpos = size+*newpos;
3454     if (*newpos<0 || *newpos>size) {
3455         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3456         Py_DECREF(restuple);
3457         return NULL;
3458     }
3459     Py_INCREF(resunicode);
3460     Py_DECREF(restuple);
3461     return resunicode;
3462 }
3463
3464 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3465                                  Py_ssize_t size,
3466                                  const char *errors,
3467                                  int limit)
3468 {
3469     /* output object */
3470     PyObject *res;
3471     /* pointers to the beginning and end+1 of input */
3472     const Py_UNICODE *startp = p;
3473     const Py_UNICODE *endp = p + size;
3474     /* pointer to the beginning of the unencodable characters */
3475     /* const Py_UNICODE *badp = NULL; */
3476     /* pointer into the output */
3477     char *str;
3478     /* current output position */
3479     Py_ssize_t respos = 0;
3480     Py_ssize_t ressize;
3481     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3482     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3483     PyObject *errorHandler = NULL;
3484     PyObject *exc = NULL;
3485     /* the following variable is used for caching string comparisons
3486      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3487     int known_errorHandler = -1;
3488
3489     /* allocate enough for a simple encoding without
3490        replacements, if we need more, we'll resize */
3491     res = PyString_FromStringAndSize(NULL, size);
3492     if (res == NULL)
3493         goto onError;
3494     if (size == 0)
3495         return res;
3496     str = PyString_AS_STRING(res);
3497     ressize = size;
3498
3499     while (p<endp) {
3500         Py_UNICODE c = *p;
3501
3502         /* can we encode this? */
3503         if (c<limit) {
3504             /* no overflow check, because we know that the space is enough */
3505             *str++ = (char)c;
3506             ++p;
3507         }
3508         else {
3509             Py_ssize_t unicodepos = p-startp;
3510             Py_ssize_t requiredsize;
3511             PyObject *repunicode;
3512             Py_ssize_t repsize;
3513             Py_ssize_t newpos;
3514             Py_ssize_t respos;
3515             Py_UNICODE *uni2;
3516             /* startpos for collecting unencodable chars */
3517             const Py_UNICODE *collstart = p;
3518             const Py_UNICODE *collend = p;
3519             /* find all unecodable characters */
3520             while ((collend < endp) && ((*collend)>=limit))
3521                 ++collend;
3522             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3523             if (known_errorHandler==-1) {
3524                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3525                     known_errorHandler = 1;
3526                 else if (!strcmp(errors, "replace"))
3527                     known_errorHandler = 2;
3528                 else if (!strcmp(errors, "ignore"))
3529                     known_errorHandler = 3;
3530                 else if (!strcmp(errors, "xmlcharrefreplace"))
3531                     known_errorHandler = 4;
3532                 else
3533                     known_errorHandler = 0;
3534             }
3535             switch (known_errorHandler) {
3536                 case 1: /* strict */
3537                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3538                     goto onError;
3539                 case 2: /* replace */
3540                     while (collstart++<collend)
3541                         *str++ = '?'; /* fall through */
3542                 case 3: /* ignore */
3543                     p = collend;
3544                     break;
3545                 case 4: /* xmlcharrefreplace */
3546                     respos = str-PyString_AS_STRING(res);
3547                     /* determine replacement size (temporarily (mis)uses p) */
3548                     for (p = collstart, repsize = 0; p < collend; ++p) {
3549                         if (*p<10)
3550                             repsize += 2+1+1;
3551                         else if (*p<100)
3552                             repsize += 2+2+1;
3553                         else if (*p<1000)
3554                             repsize += 2+3+1;
3555                         else if (*p<10000)
3556                             repsize += 2+4+1;
3557 #ifndef Py_UNICODE_WIDE
3558                         else
3559                             repsize += 2+5+1;
3560 #else
3561                         else if (*p<100000)
3562                             repsize += 2+5+1;
3563                         else if (*p<1000000)
3564                             repsize += 2+6+1;
3565                         else
3566                             repsize += 2+7+1;
3567 #endif
3568                     }
3569                     requiredsize = respos+repsize+(endp-collend);
3570                     if (requiredsize > ressize) {
3571                         if (requiredsize<2*ressize)
3572                             requiredsize = 2*ressize;
3573                         if (_PyString_Resize(&res, requiredsize))
3574                             goto onError;
3575                         str = PyString_AS_STRING(res) + respos;
3576                         ressize = requiredsize;
3577                     }
3578                     /* generate replacement (temporarily (mis)uses p) */
3579                     for (p = collstart; p < collend; ++p) {
3580                         str += sprintf(str, "&#%d;", (int)*p);
3581                     }
3582                     p = collend;
3583                     break;
3584                 default:
3585                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3586                         encoding, reason, startp, size, &exc,
3587                         collstart-startp, collend-startp, &newpos);
3588                     if (repunicode == NULL)
3589                         goto onError;
3590                     /* need more space? (at least enough for what we
3591                        have+the replacement+the rest of the string, so
3592                        we won't have to check space for encodable characters) */
3593                     respos = str-PyString_AS_STRING(res);
3594                     repsize = PyUnicode_GET_SIZE(repunicode);
3595                     requiredsize = respos+repsize+(endp-collend);
3596                     if (requiredsize > ressize) {
3597                         if (requiredsize<2*ressize)
3598                             requiredsize = 2*ressize;
3599                         if (_PyString_Resize(&res, requiredsize)) {
3600                             Py_DECREF(repunicode);
3601                             goto onError;
3602                         }
3603                         str = PyString_AS_STRING(res) + respos;
3604                         ressize = requiredsize;
3605                     }
3606                     /* check if there is anything unencodable in the replacement
3607                        and copy it to the output */
3608                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3609                         c = *uni2;
3610                         if (c >= limit) {
3611                             raise_encode_exception(&exc, encoding, startp, size,
3612                                 unicodepos, unicodepos+1, reason);
3613                             Py_DECREF(repunicode);
3614                             goto onError;
3615                         }
3616                         *str = (char)c;
3617                     }
3618                     p = startp + newpos;
3619                     Py_DECREF(repunicode);
3620             }
3621         }
3622     }
3623     /* Resize if we allocated to much */
3624     respos = str-PyString_AS_STRING(res);
3625     if (respos<ressize)
3626        /* If this falls res will be NULL */
3627         _PyString_Resize(&res, respos);
3628     Py_XDECREF(errorHandler);
3629     Py_XDECREF(exc);
3630     return res;
3631
3632     onError:
3633     Py_XDECREF(res);
3634     Py_XDECREF(errorHandler);
3635     Py_XDECREF(exc);
3636     return NULL;
3637 }
3638
3639 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3640                                  Py_ssize_t size,
3641                                  const char *errors)
3642 {
3643     return unicode_encode_ucs1(p, size, errors, 256);
3644 }
3645
3646 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3647 {
3648     if (!PyUnicode_Check(unicode)) {
3649         PyErr_BadArgument();
3650         return NULL;
3651     }
3652     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3653                                   PyUnicode_GET_SIZE(unicode),
3654                                   NULL);
3655 }
3656
3657 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3658
3659 PyObject *PyUnicode_DecodeASCII(const char *s,
3660                                 Py_ssize_t size,
3661                                 const char *errors)
3662 {
3663     const char *starts = s;
3664     PyUnicodeObject *v;
3665     Py_UNICODE *p;
3666     Py_ssize_t startinpos;
3667     Py_ssize_t endinpos;
3668     Py_ssize_t outpos;
3669     const char *e;
3670     PyObject *errorHandler = NULL;
3671     PyObject *exc = NULL;
3672
3673     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3674     if (size == 1 && *(unsigned char*)s < 128) {
3675         Py_UNICODE r = *(unsigned char*)s;
3676         return PyUnicode_FromUnicode(&r, 1);
3677     }
3678
3679     v = _PyUnicode_New(size);
3680     if (v == NULL)
3681         goto onError;
3682     if (size == 0)
3683         return (PyObject *)v;
3684     p = PyUnicode_AS_UNICODE(v);
3685     e = s + size;
3686     while (s < e) {
3687         register unsigned char c = (unsigned char)*s;
3688         if (c < 128) {
3689             *p++ = c;
3690             ++s;
3691         }
3692         else {
3693             startinpos = s-starts;
3694             endinpos = startinpos + 1;
3695             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3696             if (unicode_decode_call_errorhandler(
3697                  errors, &errorHandler,
3698                  "ascii", "ordinal not in range(128)",
3699                  starts, size, &startinpos, &endinpos, &exc, &s,
3700                  &v, &outpos, &p))
3701                 goto onError;
3702         }
3703     }
3704     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3705         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3706             goto onError;
3707     Py_XDECREF(errorHandler);
3708     Py_XDECREF(exc);
3709     return (PyObject *)v;
3710
3711  onError:
3712     Py_XDECREF(v);
3713     Py_XDECREF(errorHandler);
3714     Py_XDECREF(exc);
3715     return NULL;
3716 }
3717
3718 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3719                                 Py_ssize_t size,
3720                                 const char *errors)
3721 {
3722     return unicode_encode_ucs1(p, size, errors, 128);
3723 }
3724
3725 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3726 {
3727     if (!PyUnicode_Check(unicode)) {
3728         PyErr_BadArgument();
3729         return NULL;
3730     }
3731     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3732                                  PyUnicode_GET_SIZE(unicode),
3733                                  NULL);
3734 }
3735
3736 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3737
3738 /* --- MBCS codecs for Windows -------------------------------------------- */
3739
3740 #if SIZEOF_INT < SIZEOF_SSIZE_T
3741 #define NEED_RETRY
3742 #endif
3743
3744 /* XXX This code is limited to "true" double-byte encodings, as
3745    a) it assumes an incomplete character consists of a single byte, and
3746    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3747       encodings, see IsDBCSLeadByteEx documentation. */
3748
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(); the character
   found by CharPrev() is then inspected to decide whether the byte is a
   real lead byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        /* CharPrev() did not move */
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* previous character starts with a lead byte: accept only if it
       spans exactly two bytes */
    return target - before == 2;
}
3759
3760 /*
3761  * Decode MBCS string into unicode object. If 'final' is set, converts
3762  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3763  */
3764 static int decode_mbcs(PyUnicodeObject **v,
3765                         const char *s, /* MBCS string */
3766                         int size, /* sizeof MBCS string */
3767                         int final)
3768 {
3769     Py_UNICODE *p;
3770     Py_ssize_t n = 0;
3771     int usize = 0;
3772
3773     assert(size >= 0);
3774
3775     /* Skip trailing lead-byte unless 'final' is set */
3776     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3777         --size;
3778
3779     /* First get the size of the result */
3780     if (size > 0) {
3781         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3782         if (usize == 0) {
3783             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3784             return -1;
3785         }
3786     }
3787
3788     if (*v == NULL) {
3789         /* Create unicode object */
3790         *v = _PyUnicode_New(usize);
3791         if (*v == NULL)
3792             return -1;
3793     }
3794     else {
3795         /* Extend unicode object */
3796         n = PyUnicode_GET_SIZE(*v);
3797         if (_PyUnicode_Resize(v, n + usize) < 0)
3798             return -1;
3799     }
3800
3801     /* Do the conversion */
3802     if (size > 0) {
3803         p = PyUnicode_AS_UNICODE(*v) + n;
3804         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3805             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3806             return -1;
3807         }
3808     }
3809
3810     return size;
3811 }
3812
3813 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3814                                         Py_ssize_t size,
3815                                         const char *errors,
3816                                         Py_ssize_t *consumed)
3817 {
3818     PyUnicodeObject *v = NULL;
3819     int done;
3820
3821     if (consumed)
3822         *consumed = 0;
3823
3824 #ifdef NEED_RETRY
3825   retry:
3826     if (size > INT_MAX)
3827         done = decode_mbcs(&v, s, INT_MAX, 0);
3828     else
3829 #endif
3830         done = decode_mbcs(&v, s, (int)size, !consumed);
3831
3832     if (done < 0) {
3833         Py_XDECREF(v);
3834         return NULL;
3835     }
3836
3837     if (consumed)
3838         *consumed += done;
3839
3840 #ifdef NEED_RETRY
3841     if (size > INT_MAX) {
3842         s += done;
3843         size -= done;
3844         goto retry;
3845     }
3846 #endif
3847
3848     return (PyObject *)v;
3849 }
3850
3851 PyObject *PyUnicode_DecodeMBCS(const char *s,
3852                                 Py_ssize_t size,
3853                                 const char *errors)
3854 {
3855     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3856 }
3857
3858 /*
3859  * Convert unicode into string object (MBCS).
3860  * Returns 0 if succeed, -1 otherwise.
3861  */
3862 static int encode_mbcs(PyObject **repr,
3863                         const Py_UNICODE *p, /* unicode */
3864                         int size) /* size of unicode */
3865 {
3866     int mbcssize = 0;
3867     Py_ssize_t n = 0;
3868
3869     assert(size >= 0);
3870
3871     /* First get the size of the result */
3872     if (size > 0) {
3873         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3874         if (mbcssize == 0) {
3875             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3876             return -1;
3877         }
3878     }
3879
3880     if (*repr == NULL) {
3881         /* Create string object */
3882         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3883         if (*repr == NULL)
3884             return -1;
3885     }
3886     else {
3887         /* Extend string object */
3888         n = PyString_Size(*repr);
3889         if (_PyString_Resize(repr, n + mbcssize) < 0)
3890             return -1;
3891     }
3892
3893     /* Do the conversion */
3894     if (size > 0) {
3895         char *s = PyString_AS_STRING(*repr) + n;
3896         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3897             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3898             return -1;
3899         }
3900     }
3901
3902     return 0;
3903 }
3904
3905 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3906                                 Py_ssize_t size,
3907                                 const char *errors)
3908 {
3909     PyObject *repr = NULL;
3910     int ret;
3911
3912 #ifdef NEED_RETRY
3913  retry:
3914     if (size > INT_MAX)
3915         ret = encode_mbcs(&repr, p, INT_MAX);
3916     else
3917 #endif
3918         ret = encode_mbcs(&repr, p, (int)size);
3919
3920     if (ret < 0) {
3921         Py_XDECREF(repr);
3922         return NULL;
3923     }
3924
3925 #ifdef NEED_RETRY
3926     if (size > INT_MAX) {
3927         p += INT_MAX;
3928         size -= INT_MAX;
3929         goto retry;
3930     }
3931 #endif
3932
3933     return repr;
3934 }
3935
3936 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3937 {
3938     if (!PyUnicode_Check(unicode)) {
3939         PyErr_BadArgument();
3940         return NULL;
3941     }
3942     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3943                                 PyUnicode_GET_SIZE(unicode),
3944                                 NULL);
3945 }
3946
3947 #undef NEED_RETRY
3948
3949 #endif /* MS_WINDOWS */
3950
3951 /* --- Character Mapping Codec -------------------------------------------- */
3952
/* Decode the `size` bytes at `s` through a character mapping.

   mapping == NULL      decode as Latin-1.
   exact unicode object used as a lookup string indexed by byte value;
                        bytes beyond its length, or entries equal to
                        0xfffe, count as undefined.
   any other object     looked up with PyObject_GetItem(mapping, byte);
                        the result may be an integer in range(65536),
                        a unicode string (of any length, including empty),
                        or None / a LookupError for "undefined".

   Undefined bytes are reported to unicode_decode_call_errorhandler() with
   the reason "character maps to <undefined>".

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots reserved for 1-n mappings (general-mapping path) */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a unicode string used as a table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                     errors, &errorHandler,
                     "charmap", "character maps to <undefined>",
                     starts, size, &startinpos, &endinpos, &exc, &s,
                     &v, &outpos, &p)) {
                    goto onError;
                }
                /* the handler was given &s and &p; do not advance here */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* general path: look every byte up in an arbitrary mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* NOTE(review): extrachars is not adjusted after this call;
                   presumably the handler keeps v large enough for the
                   remaining input -- confirm against its definition. */
                if (unicode_decode_call_errorhandler(
                     errors, &errorHandler,
                     "charmap", "character maps to <undefined>",
                     starts, size, &startinpos, &endinpos, &exc, &s,
                     &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* cover the shortfall plus 4*targetsize of headroom */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                                     (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                             PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved: recompute the cursor */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                      "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result if fewer characters were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4113
/* Charmap encoding: the lookup table.

   A three-level trie mapping a 16-bit character c to an output byte:

     level1[c >> 11]          number of a 16-entry level-2 block,
                              or 0xFF if there is none
     level-2 block entry      indexed by (c >> 7) & 0xF: number of a
                              128-entry level-3 block, or 0xFF if none
     level-3 block entry      indexed by c & 0x7F: the encoded byte,
                              or 0 if the character is unmapped

   level23 holds the count2 level-2 blocks (16 bytes each) followed by the
   count3 level-3 blocks (128 bytes each).  It is declared with a single
   element; PyUnicode_BuildEncodingMap() allocates the object with the
   additional bytes appended. */

struct encoding_map{
  PyObject_HEAD
  unsigned char level1[32];     /* first trie level, indexed by c >> 11 */
  int count2, count3;           /* number of level-2 / level-3 blocks */
  unsigned char level23[1];     /* level-2 blocks, then level-3 blocks */
};
4122
4123 static PyObject*
4124 encoding_map_size(PyObject *obj, PyObject* args)
4125 {
4126     struct encoding_map *map = (struct encoding_map*)obj;
4127     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4128                           128*map->count3);
4129 }
4130
/* Method table for EncodingMapType: a single no-argument method "size",
   followed by the zeroed sentinel entry that ends the table. */
static PyMethodDef encoding_map_methods[] = {
        {"size", encoding_map_size, METH_NOARGS,
         PyDoc_STR("Return the size (in bytes) of this object") },
        { 0 }
};
4136
4137 static void
4138 encoding_map_dealloc(PyObject* o)
4139 {
4140         PyObject_FREE(o);
4141 }
4142
/* Type object for the "EncodingMap" objects created by
   PyUnicode_BuildEncodingMap().  Only the deallocator and the method table
   are filled in; tp_new is 0 here. */
static PyTypeObject EncodingMapType = {
        PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4186
4187 PyObject*
4188 PyUnicode_BuildEncodingMap(PyObject* string)
4189 {
4190     Py_UNICODE *decode;
4191     PyObject *result;
4192     struct encoding_map *mresult;
4193     int i;
4194     int need_dict = 0;
4195     unsigned char level1[32];
4196     unsigned char level2[512];
4197     unsigned char *mlevel1, *mlevel2, *mlevel3;
4198     int count2 = 0, count3 = 0;
4199
4200     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4201         PyErr_BadArgument();
4202         return NULL;
4203     }
4204     decode = PyUnicode_AS_UNICODE(string);
4205     memset(level1, 0xFF, sizeof level1);
4206     memset(level2, 0xFF, sizeof level2);
4207
4208     /* If there isn't a one-to-one mapping of NULL to \0,
4209        or if there are non-BMP characters, we need to use
4210        a mapping dictionary. */
4211     if (decode[0] != 0)
4212         need_dict = 1;
4213     for (i = 1; i < 256; i++) {
4214         int l1, l2;
4215         if (decode[i] == 0
4216             #ifdef Py_UNICODE_WIDE
4217             || decode[i] > 0xFFFF
4218             #endif
4219         ) {
4220             need_dict = 1;
4221             break;
4222         }
4223         if (decode[i] == 0xFFFE)
4224             /* unmapped character */
4225             continue;
4226         l1 = decode[i] >> 11;
4227         l2 = decode[i] >> 7;
4228         if (level1[l1] == 0xFF)
4229             level1[l1] = count2++;
4230         if (level2[l2] == 0xFF)
4231             level2[l2] = count3++;
4232     }
4233
4234     if (count2 >= 0xFF || count3 >= 0xFF)
4235         need_dict = 1;
4236
4237     if (need_dict) {
4238         PyObject *result = PyDict_New();
4239         PyObject *key, *value;
4240         if (!result)
4241             return NULL;
4242         for (i = 0; i < 256; i++) {
4243             key = value = NULL;
4244             key = PyInt_FromLong(decode[i]);
4245             value = PyInt_FromLong(i);
4246             if (!key || !value)
4247                 goto failed1;
4248             if (PyDict_SetItem(result, key, value) == -1)
4249                 goto failed1;
4250             Py_DECREF(key);
4251             Py_DECREF(value);
4252         }
4253         return result;
4254       failed1:
4255         Py_XDECREF(key);
4256         Py_XDECREF(value);
4257         Py_DECREF(result);
4258         return NULL;
4259     }
4260
4261     /* Create a three-level trie */
4262     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4263                              16*count2 + 128*count3 - 1);
4264     if (!result)
4265         return PyErr_NoMemory();
4266     PyObject_Init(result, &EncodingMapType);
4267     mresult = (struct encoding_map*)result;
4268     mresult->count2 = count2;
4269     mresult->count3 = count3;
4270     mlevel1 = mresult->level1;
4271     mlevel2 = mresult->level23;
4272     mlevel3 = mresult->level23 + 16*count2;
4273     memcpy(mlevel1, level1, 32);
4274     memset(mlevel2, 0xFF, 16*count2);
4275     memset(mlevel3, 0, 128*count3);
4276     count3 = 0;
4277     for (i = 1; i < 256; i++) {
4278         int o1, o2, o3, i2, i3;
4279         if (decode[i] == 0xFFFE)
4280             /* unmapped character */
4281             continue;
4282         o1 = decode[i]>>11;
4283         o2 = (decode[i]>>7) & 0xF;
4284         i2 = 16*mlevel1[o1] + o2;
4285         if (mlevel2[i2] == 0xFF)
4286             mlevel2[i2] = count3++;
4287         o3 = decode[i] & 0x7F;
4288         i3 = 128*mlevel2[i2] + o3;
4289         mlevel3[i3] = i;
4290     }
4291     return result;
4292 }
4293
4294 static int
4295 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4296 {
4297     struct encoding_map *map = (struct encoding_map*)mapping;
4298     int l1 = c>>11;
4299     int l2 = (c>>7) & 0xF;
4300     int l3 = c & 0x7F;
4301     int i;
4302
4303 #ifdef Py_UNICODE_WIDE
4304     if (c > 0xFFFF) {
4305         return -1;
4306     }
4307 #endif
4308     if (c == 0)
4309         return 0;
4310     /* level 1*/
4311     i = map->level1[l1];
4312     if (i == 0xFF) {
4313         return -1;
4314     }
4315     /* level 2*/
4316     i = map->level23[16*i+l2];
4317     if (i == 0xFF) {
4318         return -1;
4319     }
4320     /* level 3 */
4321     i = map->level23[16*map->count2 + 128*i + l3];
4322     if (i == 0) {
4323         return -1;
4324     }
4325     return i;
4326 }
4327
4328 /* Lookup the character ch in the mapping. If the character
4329    can't be found, Py_None is returned (or NULL, if another
4330    error occurred). */
4331 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4332 {
4333     PyObject *w = PyInt_FromLong((long)c);
4334     PyObject *x;
4335
4336     if (w == NULL)
4337          return NULL;
4338     x = PyObject_GetItem(mapping, w);
4339     Py_DECREF(w);
4340     if (x == NULL) {
4341         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4342             /* No mapping found means: mapping is undefined. */
4343             PyErr_Clear();
4344             x = Py_None;
4345             Py_INCREF(x);
4346             return x;
4347         } else
4348             return NULL;
4349     }
4350     else if (x == Py_None)
4351         return x;
4352     else if (PyInt_Check(x)) {
4353         long value = PyInt_AS_LONG(x);
4354         if (value < 0 || value > 255) {
4355             PyErr_SetString(PyExc_TypeError,
4356                              "character mapping must be in range(256)");
4357             Py_DECREF(x);
4358             return NULL;
4359         }
4360         return x;
4361     }
4362     else if (PyString_Check(x))
4363         return x;
4364     else {
4365         /* wrong return value */
4366         PyErr_SetString(PyExc_TypeError,
4367               "character mapping must return integer, None or str");
4368         Py_DECREF(x);
4369         return NULL;
4370     }
4371 }
4372
4373 static int
4374 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4375 {
4376         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4377         /* exponentially overallocate to minimize reallocations */
4378         if (requiredsize < 2*outsize)
4379             requiredsize = 2*outsize;
4380         if (_PyString_Resize(outobj, requiredsize)) {
4381             return 0;
4382         }
4383         return 1;
4384 }
4385
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
4389 /* lookup the character, put the result in the output string and adjust
4390    various state variables. Reallocate the output string if not enough
4391    space is available. Return a new reference to the object that
4392    was put in the output buffer, or Py_None, if the mapping was undefined
4393    (in which case no character was written) or NULL, if a
4394    reallocation error occurred. The caller must decref the result */
4395 static
4396 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4397     PyObject **outobj, Py_ssize_t *outpos)
4398 {
4399     PyObject *rep;
4400     char *outstart;
4401     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4402
4403     if (Py_TYPE(mapping) == &EncodingMapType) {
4404         int res = encoding_map_lookup(c, mapping);
4405         Py_ssize_t requiredsize = *outpos+1;
4406         if (res == -1)
4407             return enc_FAILED;
4408         if (outsize<requiredsize)
4409             if (!charmapencode_resize(outobj, outpos, requiredsize))
4410                 return enc_EXCEPTION;
4411         outstart = PyString_AS_STRING(*outobj);
4412         outstart[(*outpos)++] = (char)res;
4413         return enc_SUCCESS;
4414     }
4415
4416     rep = charmapencode_lookup(c, mapping);
4417     if (rep==NULL)
4418         return enc_EXCEPTION;
4419     else if (rep==Py_None) {
4420         Py_DECREF(rep);
4421         return enc_FAILED;
4422     } else {
4423         if (PyInt_Check(rep)) {
4424             Py_ssize_t requiredsize = *outpos+1;
4425             if (outsize<requiredsize)
4426                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4427                     Py_DECREF(rep);
4428                     return enc_EXCEPTION;
4429                 }
4430             outstart = PyString_AS_STRING(*outobj);
4431             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4432         }
4433         else {
4434             const char *repchars = PyString_AS_STRING(rep);
4435             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4436             Py_ssize_t requiredsize = *outpos+repsize;
4437             if (outsize<requiredsize)
4438                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4439                     Py_DECREF(rep);
4440                     return enc_EXCEPTION;
4441                 }
4442             outstart = PyString_AS_STRING(*outobj);
4443             memcpy(outstart + *outpos, repchars, repsize);
4444             *outpos += repsize;
4445         }
4446     }
4447     Py_DECREF(rep);
4448     return enc_SUCCESS;
4449 }
4450
4451 /* handle an error in PyUnicode_EncodeCharmap
4452    Return 0 on success, -1 on error */
4453 static
4454 int charmap_encoding_error(
4455     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4456     PyObject **exceptionObject,
4457     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4458     PyObject **res, Py_ssize_t *respos)
4459 {
4460     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4461     Py_ssize_t repsize;
4462     Py_ssize_t newpos;
4463     Py_UNICODE *uni2;
4464     /* startpos for collecting unencodable chars */
4465     Py_ssize_t collstartpos = *inpos;
4466     Py_ssize_t collendpos = *inpos+1;
4467     Py_ssize_t collpos;
4468     char *encoding = "charmap";
4469     char *reason = "character maps to <undefined>";
4470     charmapencode_result x;
4471
4472     /* find all unencodable characters */
4473     while (collendpos < size) {
4474         PyObject *rep;
4475         if (Py_TYPE(mapping) == &EncodingMapType) {
4476             int res = encoding_map_lookup(p[collendpos], mapping);
4477             if (res != -1)
4478                 break;
4479             ++collendpos;
4480             continue;
4481         }
4482
4483         rep = charmapencode_lookup(p[collendpos], mapping);
4484         if (rep==NULL)
4485             return -1;
4486         else if (rep!=Py_None) {
4487             Py_DECREF(rep);
4488             break;
4489         }
4490         Py_DECREF(rep);
4491         ++collendpos;
4492     }
4493     /* cache callback name lookup
4494      * (if not done yet, i.e. it's the first error) */
4495     if (*known_errorHandler==-1) {
4496         if ((errors==NULL) || (!strcmp(errors, "strict")))
4497             *known_errorHandler = 1;
4498         else if (!strcmp(errors, "replace"))
4499             *known_errorHandler = 2;
4500         else if (!strcmp(errors, "ignore"))
4501             *known_errorHandler = 3;
4502         else if (!strcmp(errors, "xmlcharrefreplace"))
4503             *known_errorHandler = 4;
4504         else
4505             *known_errorHandler = 0;
4506     }
4507     switch (*known_errorHandler) {
4508         case 1: /* strict */
4509             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510             return -1;
4511         case 2: /* replace */
4512             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4513                 x = charmapencode_output('?', mapping, res, respos);
4514                 if (x==enc_EXCEPTION) {
4515                     return -1;
4516                 }
4517                 else if (x==enc_FAILED) {
4518                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4519                     return -1;
4520                 }
4521             }
4522             /* fall through */
4523         case 3: /* ignore */
4524             *inpos = collendpos;
4525             break;
4526         case 4: /* xmlcharrefreplace */
4527             /* generate replacement (temporarily (mis)uses p) */
4528             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4529                 char buffer[2+29+1+1];
4530                 char *cp;
4531                 sprintf(buffer, "&#%d;", (int)p[collpos]);
4532                 for (cp = buffer; *cp; ++cp) {
4533                     x = charmapencode_output(*cp, mapping, res, respos);
4534                     if (x==enc_EXCEPTION)
4535                         return -1;
4536                     else if (x==enc_FAILED) {
4537                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4538                         return -1;
4539                     }
4540                 }
4541             }
4542             *inpos = collendpos;
4543             break;
4544         default:
4545             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4546                 encoding, reason, p, size, exceptionObject,
4547                 collstartpos, collendpos, &newpos);
4548             if (repunicode == NULL)
4549                 return -1;
4550             /* generate replacement  */
4551             repsize = PyUnicode_GET_SIZE(repunicode);
4552             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4553                 x = charmapencode_output(*uni2, mapping, res, respos);
4554                 if (x==enc_EXCEPTION) {
4555                     return -1;
4556                 }
4557                 else if (x==enc_FAILED) {
4558                     Py_DECREF(repunicode);
4559                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4560                     return -1;
4561                 }
4562             }
4563             *inpos = newpos;
4564             Py_DECREF(repunicode);
4565     }
4566     return 0;
4567 }
4568
4569 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4570                                   Py_ssize_t size,
4571                                   PyObject *mapping,
4572                                   const char *errors)
4573 {
4574     /* output object */
4575     PyObject *res = NULL;
4576     /* current input position */
4577     Py_ssize_t inpos = 0;
4578     /* current output position */
4579     Py_ssize_t respos = 0;
4580     PyObject *errorHandler = NULL;
4581     PyObject *exc = NULL;
4582     /* the following variable is used for caching string comparisons
4583      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4584      * 3=ignore, 4=xmlcharrefreplace */
4585     int known_errorHandler = -1;
4586
4587     /* Default to Latin-1 */
4588     if (mapping == NULL)
4589         return PyUnicode_EncodeLatin1(p, size, errors);
4590
4591     /* allocate enough for a simple encoding without
4592        replacements, if we need more, we'll resize */
4593     res = PyString_FromStringAndSize(NULL, size);
4594     if (res == NULL)
4595         goto onError;
4596     if (size == 0)
4597         return res;
4598
4599     while (inpos<size) {
4600         /* try to encode it */
4601         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4602         if (x==enc_EXCEPTION) /* error */
4603             goto onError;
4604         if (x==enc_FAILED) { /* unencodable character */
4605             if (charmap_encoding_error(p, size, &inpos, mapping,
4606                 &exc,
4607                 &known_errorHandler, &errorHandler, errors,
4608                 &res, &respos)) {
4609                 goto onError;
4610             }
4611         }
4612         else
4613             /* done with this character => adjust input position */
4614             ++inpos;
4615     }
4616
4617     /* Resize if we allocated to much */
4618     if (respos<PyString_GET_SIZE(res)) {
4619         if (_PyString_Resize(&res, respos))
4620             goto onError;
4621     }
4622     Py_XDECREF(exc);
4623     Py_XDECREF(errorHandler);
4624     return res;
4625
4626     onError:
4627     Py_XDECREF(res);
4628     Py_XDECREF(exc);
4629     Py_XDECREF(errorHandler);
4630     return NULL;
4631 }
4632
4633 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4634                                     PyObject *mapping)
4635 {
4636     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4637         PyErr_BadArgument();
4638         return NULL;
4639     }
4640     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4641                                    PyUnicode_GET_SIZE(unicode),
4642                                    mapping,
4643                                    NULL);
4644 }
4645
4646 /* create or adjust a UnicodeTranslateError */
4647 static void make_translate_exception(PyObject **exceptionObject,
4648     const Py_UNICODE *unicode, Py_ssize_t size,
4649     Py_ssize_t startpos, Py_ssize_t endpos,
4650     const char *reason)
4651 {
4652     if (*exceptionObject == NULL) {
4653         *exceptionObject = PyUnicodeTranslateError_Create(
4654             unicode, size, startpos, endpos, reason);
4655     }
4656     else {
4657         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4658             goto onError;
4659         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4660             goto onError;
4661         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4662             goto onError;
4663         return;
4664         onError:
4665         Py_DECREF(*exceptionObject);
4666         *exceptionObject = NULL;
4667     }
4668 }
4669
4670 /* raises a UnicodeTranslateError */
4671 static void raise_translate_exception(PyObject **exceptionObject,
4672     const Py_UNICODE *unicode, Py_ssize_t size,
4673     Py_ssize_t startpos, Py_ssize_t endpos,
4674     const char *reason)
4675 {
4676     make_translate_exception(exceptionObject,
4677         unicode, size, startpos, endpos, reason);
4678     if (*exceptionObject != NULL)
4679         PyCodec_StrictErrors(*exceptionObject);
4680 }
4681
4682 /* error handling callback helper:
4683    build arguments, call the callback and check the arguments,
4684    put the result into newpos and return the replacement string, which
4685    has to be freed by the caller */
4686 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4687     PyObject **errorHandler,
4688     const char *reason,
4689     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4690     Py_ssize_t startpos, Py_ssize_t endpos,
4691     Py_ssize_t *newpos)
4692 {
4693     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4694
4695     Py_ssize_t i_newpos;
4696     PyObject *restuple;
4697     PyObject *resunicode;
4698
4699     if (*errorHandler == NULL) {
4700         *errorHandler = PyCodec_LookupError(errors);
4701         if (*errorHandler == NULL)
4702             return NULL;
4703     }
4704
4705     make_translate_exception(exceptionObject,
4706         unicode, size, startpos, endpos, reason);
4707     if (*exceptionObject == NULL)
4708         return NULL;
4709
4710     restuple = PyObject_CallFunctionObjArgs(
4711         *errorHandler, *exceptionObject, NULL);
4712     if (restuple == NULL)
4713         return NULL;
4714     if (!PyTuple_Check(restuple)) {
4715         PyErr_Format(PyExc_TypeError, &argparse[4]);
4716         Py_DECREF(restuple);
4717         return NULL;
4718     }
4719     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4720         &resunicode, &i_newpos)) {
4721         Py_DECREF(restuple);
4722         return NULL;
4723     }
4724     if (i_newpos<0)
4725         *newpos = size+i_newpos;
4726     else
4727         *newpos = i_newpos;
4728     if (*newpos<0 || *newpos>size) {
4729         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4730         Py_DECREF(restuple);
4731         return NULL;
4732     }
4733     Py_INCREF(resunicode);
4734     Py_DECREF(restuple);
4735     return resunicode;
4736 }
4737
4738 /* Lookup the character ch in the mapping and put the result in result,
4739    which must be decrefed by the caller.
4740    Return 0 on success, -1 on error */
4741 static
4742 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4743 {
4744     PyObject *w = PyInt_FromLong((long)c);
4745     PyObject *x;
4746
4747     if (w == NULL)
4748          return -1;
4749     x = PyObject_GetItem(mapping, w);
4750     Py_DECREF(w);
4751     if (x == NULL) {
4752         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4753             /* No mapping found means: use 1:1 mapping. */
4754             PyErr_Clear();
4755             *result = NULL;
4756             return 0;
4757         } else
4758             return -1;
4759     }
4760     else if (x == Py_None) {
4761         *result = x;
4762         return 0;
4763     }
4764     else if (PyInt_Check(x)) {
4765         long value = PyInt_AS_LONG(x);
4766         long max = PyUnicode_GetMax();
4767         if (value < 0 || value > max) {
4768             PyErr_Format(PyExc_TypeError,
4769                              "character mapping must be in range(0x%lx)", max+1);
4770             Py_DECREF(x);
4771             return -1;
4772         }
4773         *result = x;
4774         return 0;
4775     }
4776     else if (PyUnicode_Check(x)) {
4777         *result = x;
4778         return 0;
4779     }
4780     else {
4781         /* wrong return value */
4782         PyErr_SetString(PyExc_TypeError,
4783               "character mapping must return integer, None or unicode");
4784         Py_DECREF(x);
4785         return -1;
4786     }
4787 }
4788 /* ensure that *outobj is at least requiredsize characters long,
4789 if not reallocate and adjust various state variables.
4790 Return 0 on success, -1 on error */
4791 static
4792 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4793     Py_ssize_t requiredsize)
4794 {
4795     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4796     if (requiredsize > oldsize) {
4797         /* remember old output position */
4798         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4799         /* exponentially overallocate to minimize reallocations */
4800         if (requiredsize < 2 * oldsize)
4801             requiredsize = 2 * oldsize;
4802         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4803             return -1;
4804         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4805     }
4806     return 0;
4807 }
4808 /* lookup the character, put the result in the output string and adjust
4809    various state variables. Return a new reference to the object that
4810    was put in the output buffer in *result, or Py_None, if the mapping was
4811    undefined (in which case no character was written).
4812    The called must decref result.
4813    Return 0 on success, -1 on error. */
4814 static
4815 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4816     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4817     PyObject **res)
4818 {
4819     if (charmaptranslate_lookup(*curinp, mapping, res))
4820         return -1;
4821     if (*res==NULL) {
4822         /* not found => default to 1:1 mapping */
4823         *(*outp)++ = *curinp;
4824     }
4825     else if (*res==Py_None)
4826         ;
4827     else if (PyInt_Check(*res)) {
4828         /* no overflow check, because we know that the space is enough */
4829         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4830     }
4831     else if (PyUnicode_Check(*res)) {
4832         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4833         if (repsize==1) {
4834             /* no overflow check, because we know that the space is enough */
4835             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4836         }
4837         else if (repsize!=0) {
4838             /* more than one character */
4839             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4840                 (insize - (curinp-startinp)) +
4841                 repsize - 1;
4842             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4843                 return -1;
4844             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4845             *outp += repsize;
4846         }
4847     }
4848     else
4849         return -1;
4850     return 0;
4851 }
4852
/* Translate the `size` characters at `p` through `mapping` and return the
   result as a new unicode object, or NULL on failure.

   Each character is looked up via charmaptranslate_output(): a missing
   entry copies the character unchanged, an int or unicode entry is
   written as the replacement, and a None entry marks the character as
   untranslatable.  Runs of untranslatable characters are handled
   according to `errors` (NULL or "strict", "replace", "ignore",
   "xmlcharrefreplace", or any other name, which is resolved through
   unicode_translate_call_errorhandler()).

   Calls PyErr_BadArgument() and returns NULL if mapping is NULL. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* number of characters written; only computed once the loop is done */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to translate it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared against Py_None from here on, never
           dereferenced, so it can be released right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
                case 1: /* strict */
                    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                    goto onError;
                case 2: /* replace */
                    /* No need to check for space, this is a 1:1 replacement */
                    for (coll = collstart; coll<collend; ++coll)
                        *str++ = '?';
                    /* fall through */
                case 3: /* ignore */
                    p = collend;
                    break;
                case 4: /* xmlcharrefreplace */
                    /* generate replacement (temporarily (mis)uses p) */
                    for (p = collstart; p < collend; ++p) {
                        char buffer[2+29+1+1];
                        char *cp;
                        sprintf(buffer, "&#%d;", (int)*p);
                        /* room for what is written so far, this character
                           reference, and the input after the run */
                        if (charmaptranslate_makespace(&res, &str,
                            (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                            goto onError;
                        for (cp = buffer; *cp; ++cp)
                            *str++ = *cp;
                    }
                    p = collend;
                    break;
                default:
                    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                        reason, startp, size, &exc,
                        collstart-startp, collend-startp, &newpos);
                    if (repunicode == NULL)
                        goto onError;
                    /* generate replacement  */
                    repsize = PyUnicode_GET_SIZE(repunicode);
                    /* NOTE(review): the space reserved here assumes input
                       resumes at collend, but p is set from newpos below;
                       if a handler returns a position before collend the
                       later unchecked 1:1 writes may need more room --
                       TODO confirm */
                    if (charmaptranslate_makespace(&res, &str,
                        (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                        *str++ = *uni2;
                    p = startp + newpos;
                    Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

    onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4994
4995 PyObject *PyUnicode_Translate(PyObject *str,
4996                               PyObject *mapping,
4997                               const char *errors)
4998 {
4999     PyObject *result;
5000
5001     str = PyUnicode_FromObject(str);
5002     if (str == NULL)
5003         goto onError;
5004     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5005                                         PyUnicode_GET_SIZE(str),
5006                                         mapping,
5007                                         errors);
5008     Py_DECREF(str);
5009     return result;
5010
5011  onError:
5012     Py_XDECREF(str);
5013     return NULL;
5014 }
5015
5016 /* --- Decimal Encoder ---------------------------------------------------- */
5017
5018 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5019                             Py_ssize_t length,
5020                             char *output,
5021                             const char *errors)
5022 {
5023     Py_UNICODE *p, *end;
5024     PyObject *errorHandler = NULL;
5025     PyObject *exc = NULL;
5026     const char *encoding = "decimal";
5027     const char *reason = "invalid decimal Unicode string";
5028     /* the following variable is used for caching string comparisons
5029      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5030     int known_errorHandler = -1;
5031
5032     if (output == NULL) {
5033         PyErr_BadArgument();
5034         return -1;
5035     }
5036
5037     p = s;
5038     end = s + length;
5039     while (p < end) {
5040         register Py_UNICODE ch = *p;
5041         int decimal;
5042         PyObject *repunicode;
5043         Py_ssize_t repsize;
5044         Py_ssize_t newpos;
5045         Py_UNICODE *uni2;
5046         Py_UNICODE *collstart;
5047         Py_UNICODE *collend;
5048
5049         if (Py_UNICODE_ISSPACE(ch)) {
5050             *output++ = ' ';
5051             ++p;
5052             continue;
5053         }
5054         decimal = Py_UNICODE_TODECIMAL(ch);
5055         if (decimal >= 0) {
5056             *output++ = '0' + decimal;
5057             ++p;
5058             continue;
5059         }
5060         if (0 < ch && ch < 256) {
5061             *output++ = (char)ch;
5062             ++p;
5063             continue;
5064         }
5065         /* All other characters are considered unencodable */
5066         collstart = p;
5067         collend = p+1;
5068         while (collend < end) {
5069             if ((0 < *collend && *collend < 256) ||
5070                 !Py_UNICODE_ISSPACE(*collend) ||
5071                 Py_UNICODE_TODECIMAL(*collend))
5072                 break;
5073         }
5074         /* cache callback name lookup
5075          * (if not done yet, i.e. it's the first error) */
5076         if (known_errorHandler==-1) {
5077             if ((errors==NULL) || (!strcmp(errors, "strict")))
5078                 known_errorHandler = 1;
5079             else if (!strcmp(errors, "replace"))
5080                 known_errorHandler = 2;
5081             else if (!strcmp(errors, "ignore"))
5082                 known_errorHandler = 3;
5083             else if (!strcmp(errors, "xmlcharrefreplace"))
5084                 known_errorHandler = 4;
5085             else
5086                 known_errorHandler = 0;
5087         }
5088         switch (known_errorHandler) {
5089             case 1: /* strict */
5090                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5091                 goto onError;
5092             case 2: /* replace */
5093                 for (p = collstart; p < collend; ++p)
5094                     *output++ = '?';
5095                 /* fall through */
5096             case 3: /* ignore */
5097                 p = collend;
5098                 break;
5099             case 4: /* xmlcharrefreplace */
5100                 /* generate replacement (temporarily (mis)uses p) */
5101                 for (p = collstart; p < collend; ++p)
5102                     output += sprintf(output, "&#%d;", (int)*p);
5103                 p = collend;
5104                 break;
5105             default:
5106                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5107                     encoding, reason, s, length, &exc,
5108                     collstart-s, collend-s, &newpos);
5109                 if (repunicode == NULL)
5110                     goto onError;
5111                 /* generate replacement  */
5112                 repsize = PyUnicode_GET_SIZE(repunicode);
5113                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5114                     Py_UNICODE ch = *uni2;
5115                     if (Py_UNICODE_ISSPACE(ch))
5116                         *output++ = ' ';
5117                     else {
5118                         decimal = Py_UNICODE_TODECIMAL(ch);
5119                         if (decimal >= 0)
5120                             *output++ = '0' + decimal;
5121                         else if (0 < ch && ch < 256)
5122                             *output++ = (char)ch;
5123                         else {
5124                             Py_DECREF(repunicode);
5125                             raise_encode_exception(&exc, encoding,
5126                                 s, length, collstart-s, collend-s, reason);
5127                             goto onError;
5128                         }
5129                     }
5130                 }
5131                 p = s + newpos;
5132                 Py_DECREF(repunicode);
5133         }
5134     }
5135     /* 0-terminate the output string */
5136     *output++ = '\0';
5137     Py_XDECREF(exc);
5138     Py_XDECREF(errorHandler);
5139     return 0;
5140
5141  onError:
5142     Py_XDECREF(exc);
5143     Py_XDECREF(errorHandler);
5144     return -1;
5145 }
5146
5147 /* --- Helpers ------------------------------------------------------------ */
5148
5149 #include "stringlib/unicodedefs.h"
5150
5151 #define FROM_UNICODE
5152
5153 #include "stringlib/fastsearch.h"
5154
5155 #include "stringlib/count.h"
5156 #include "stringlib/find.h"
5157 #include "stringlib/partition.h"
5158
/* helper macro to fixup start/end slice values

   Operates on variables named `start` and `end` that must be in scope at
   the point of use; `obj` must point to something with a `length` member.
   A negative start is offset by the length and then clamped to 0; an end
   beyond the length is clamped to the length; a negative end is offset by
   the length and then clamped to 0.

   The expansion is a plain sequence of `if` statements that already ends
   in a semicolon -- it is not wrapped in do { } while (0) -- so it must
   only be used where several statements are allowed. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5171
5172 Py_ssize_t PyUnicode_Count(PyObject *str,
5173                            PyObject *substr,
5174                            Py_ssize_t start,
5175                            Py_ssize_t end)
5176 {
5177     Py_ssize_t result;
5178     PyUnicodeObject* str_obj;
5179     PyUnicodeObject* sub_obj;
5180
5181     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5182     if (!str_obj)
5183         return -1;
5184     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5185     if (!sub_obj) {
5186         Py_DECREF(str_obj);
5187         return -1;
5188     }
5189
5190     FIX_START_END(str_obj);
5191
5192     result = stringlib_count(
5193         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5194         );
5195
5196     Py_DECREF(sub_obj);
5197     Py_DECREF(str_obj);
5198
5199     return result;
5200 }
5201
5202 Py_ssize_t PyUnicode_Find(PyObject *str,
5203                           PyObject *sub,
5204                           Py_ssize_t start,
5205                           Py_ssize_t end,
5206                           int direction)
5207 {
5208     Py_ssize_t result;
5209
5210     str = PyUnicode_FromObject(str);
5211     if (!str)
5212         return -2;
5213     sub = PyUnicode_FromObject(sub);
5214     if (!sub) {
5215         Py_DECREF(str);
5216         return -2;
5217     }
5218
5219     if (direction > 0)
5220         result = stringlib_find_slice(
5221             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5222             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5223             start, end
5224             );
5225     else
5226         result = stringlib_rfind_slice(
5227             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5228             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5229             start, end
5230             );
5231
5232     Py_DECREF(str);
5233     Py_DECREF(sub);
5234
5235     return result;
5236 }
5237
5238 static
5239 int tailmatch(PyUnicodeObject *self,
5240               PyUnicodeObject *substring,
5241               Py_ssize_t start,
5242               Py_ssize_t end,
5243               int direction)
5244 {
5245     if (substring->length == 0)
5246         return 1;
5247
5248     FIX_START_END(self);
5249
5250     end -= substring->length;
5251     if (end < start)
5252         return 0;
5253
5254     if (direction > 0) {
5255         if (Py_UNICODE_MATCH(self, end, substring))
5256             return 1;
5257     } else {
5258         if (Py_UNICODE_MATCH(self, start, substring))
5259             return 1;
5260     }
5261
5262     return 0;
5263 }
5264
5265 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5266                         PyObject *substr,
5267                         Py_ssize_t start,
5268                         Py_ssize_t end,
5269                         int direction)
5270 {
5271     Py_ssize_t result;
5272
5273     str = PyUnicode_FromObject(str);
5274     if (str == NULL)
5275         return -1;
5276     substr = PyUnicode_FromObject(substr);
5277     if (substr == NULL) {
5278         Py_DECREF(str);
5279         return -1;
5280     }
5281
5282     result = tailmatch((PyUnicodeObject *)str,
5283                        (PyUnicodeObject *)substr,
5284                        start, end, direction);
5285     Py_DECREF(str);
5286     Py_DECREF(substr);
5287     return result;
5288 }
5289
5290 /* Apply fixfct filter to the Unicode object self and return a
5291    reference to the modified object */
5292
5293 static
5294 PyObject *fixup(PyUnicodeObject *self,
5295                 int (*fixfct)(PyUnicodeObject *s))
5296 {
5297
5298     PyUnicodeObject *u;
5299
5300     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5301     if (u == NULL)
5302         return NULL;
5303
5304     Py_UNICODE_COPY(u->str, self->str, self->length);
5305
5306     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5307         /* fixfct should return TRUE if it modified the buffer. If
5308            FALSE, return a reference to the original buffer instead
5309            (to save space, not time) */
5310         Py_INCREF(self);
5311         Py_DECREF(u);
5312         return (PyObject*) self;
5313     }
5314     return (PyObject*) u;
5315 }
5316
5317 static
5318 int fixupper(PyUnicodeObject *self)
5319 {
5320     Py_ssize_t len = self->length;
5321     Py_UNICODE *s = self->str;
5322     int status = 0;
5323
5324     while (len-- > 0) {
5325         register Py_UNICODE ch;
5326
5327         ch = Py_UNICODE_TOUPPER(*s);
5328         if (ch != *s) {
5329             status = 1;
5330             *s = ch;
5331         }
5332         s++;
5333     }
5334
5335     return status;
5336 }
5337
5338 static
5339 int fixlower(PyUnicodeObject *self)
5340 {
5341     Py_ssize_t len = self->length;
5342     Py_UNICODE *s = self->str;
5343     int status = 0;
5344
5345     while (len-- > 0) {
5346         register Py_UNICODE ch;
5347
5348         ch = Py_UNICODE_TOLOWER(*s);
5349         if (ch != *s) {
5350             status = 1;
5351             *s = ch;
5352         }
5353         s++;
5354     }
5355
5356     return status;
5357 }
5358
5359 static
5360 int fixswapcase(PyUnicodeObject *self)
5361 {
5362     Py_ssize_t len = self->length;
5363     Py_UNICODE *s = self->str;
5364     int status = 0;
5365
5366     while (len-- > 0) {
5367         if (Py_UNICODE_ISUPPER(*s)) {
5368             *s = Py_UNICODE_TOLOWER(*s);
5369             status = 1;
5370         } else if (Py_UNICODE_ISLOWER(*s)) {
5371             *s = Py_UNICODE_TOUPPER(*s);
5372             status = 1;
5373         }
5374         s++;
5375     }
5376
5377     return status;
5378 }
5379
5380 static
5381 int fixcapitalize(PyUnicodeObject *self)
5382 {
5383     Py_ssize_t len = self->length;
5384     Py_UNICODE *s = self->str;
5385     int status = 0;
5386
5387     if (len == 0)
5388         return 0;
5389     if (Py_UNICODE_ISLOWER(*s)) {
5390         *s = Py_UNICODE_TOUPPER(*s);
5391         status = 1;
5392     }
5393     s++;
5394     while (--len > 0) {
5395         if (Py_UNICODE_ISUPPER(*s)) {
5396             *s = Py_UNICODE_TOLOWER(*s);
5397             status = 1;
5398         }
5399         s++;
5400     }
5401     return status;
5402 }
5403
5404 static
5405 int fixtitle(PyUnicodeObject *self)
5406 {
5407     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408     register Py_UNICODE *e;
5409     int previous_is_cased;
5410
5411     /* Shortcut for single character strings */
5412     if (PyUnicode_GET_SIZE(self) == 1) {
5413         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5414         if (*p != ch) {
5415             *p = ch;
5416             return 1;
5417         }
5418         else
5419             return 0;
5420     }
5421
5422     e = p + PyUnicode_GET_SIZE(self);
5423     previous_is_cased = 0;
5424     for (; p < e; p++) {
5425         register const Py_UNICODE ch = *p;
5426
5427         if (previous_is_cased)
5428             *p = Py_UNICODE_TOLOWER(ch);
5429         else
5430             *p = Py_UNICODE_TOTITLE(ch);
5431
5432         if (Py_UNICODE_ISLOWER(ch) ||
5433             Py_UNICODE_ISUPPER(ch) ||
5434             Py_UNICODE_ISTITLE(ch))
5435             previous_is_cased = 1;
5436         else
5437             previous_is_cased = 0;
5438     }
5439     return 1;
5440 }
5441
/* Join the items of seq into one Unicode object, inserting separator
   between consecutive items.

   separator may be NULL, in which case a single space is used; otherwise
   it is coerced with PyUnicode_FromObject().  seq is materialised with
   PySequence_Fast().  Every item must be a str or unicode object (checked
   with PyString_Check / PyUnicode_Check) and is coerced with
   PyUnicode_FromObject().

   Returns a new reference, or NULL with an exception set:
     - TypeError if an item is neither str nor unicode,
     - OverflowError if the running length or the buffer size overflows
       Py_ssize_t,
     - whatever PySequence_Fast(), PyUnicode_FromObject(),
       _PyUnicode_New() or _PyUnicode_Resize() raised.

   Shortcuts: an empty sequence yields _PyUnicode_New(0); a one-item
   sequence whose item is an exact unicode object yields that very item
   (with a new reference). */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* # of Py_UNICODE units allocated in res */
    Py_ssize_t res_used;         /* # of Py_UNICODE units used so far */
    Py_UNICODE *res_p;       /* pointer to the first free unit in res's buffer */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        /* Borrowed reference at this point; nothing to release if the
           type check below fails. */
        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item. */
        itemlen = PyUnicode_GET_SIZE(item);
        new_res_used = res_used + itemlen;
        if (new_res_used < 0)
            goto Overflow;
        if (i < seqlen - 1) {
            new_res_used += seplen;
            if (new_res_used < 0)
                goto Overflow;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                res_alloc += res_alloc;
                if (res_alloc <= 0)
                    goto Overflow;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* res was passed by address to the resize call, so recompute
               the write position from it */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

 Done:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

 Overflow:
    /* Only reached while we own a reference to item. */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

 onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5590
5591 static
5592 PyUnicodeObject *pad(PyUnicodeObject *self,
5593                      Py_ssize_t left,
5594                      Py_ssize_t right,
5595                      Py_UNICODE fill)
5596 {
5597     PyUnicodeObject *u;
5598
5599     if (left < 0)
5600         left = 0;
5601     if (right < 0)
5602         right = 0;
5603
5604     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5605         Py_INCREF(self);
5606         return self;
5607     }
5608
5609     if (left > PY_SSIZE_T_MAX - self->length ||
5610         right > PY_SSIZE_T_MAX - (left + self->length)) {
5611         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5612         return NULL;
5613     }
5614     u = _PyUnicode_New(left + self->length + right);
5615     if (u) {
5616         if (left)
5617             Py_UNICODE_FILL(u->str, fill, left);
5618         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5619         if (right)
5620             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5621     }
5622
5623     return u;
5624 }
5625
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names provided by the expansion site: a PyObject* variable
   `str` (used as scratch), a PyObject* `list`, and a label `onError`,
   which is jumped to if PyUnicode_FromUnicode() or PyList_Append()
   fails.  The temporary reference held in `str` is released on both the
   success path and the append-failure path.

   The expansion is several statements ending in an if/else, so it must
   only be used where a full statement sequence is valid (e.g. inside a
   braced block). */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
5636
5637 static
5638 PyObject *split_whitespace(PyUnicodeObject *self,
5639                            PyObject *list,
5640                            Py_ssize_t maxcount)
5641 {
5642     register Py_ssize_t i;
5643     register Py_ssize_t j;
5644     Py_ssize_t len = self->length;
5645     PyObject *str;
5646     register const Py_UNICODE *buf = self->str;
5647
5648     for (i = j = 0; i < len; ) {
5649         /* find a token */
5650         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5651             i++;
5652         j = i;
5653         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5654             i++;
5655         if (j < i) {
5656             if (maxcount-- <= 0)
5657                 break;
5658             SPLIT_APPEND(buf, j, i);
5659             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5660                 i++;
5661             j = i;
5662         }
5663     }
5664     if (j < len) {
5665         SPLIT_APPEND(buf, j, len);
5666     }
5667     return list;
5668
5669  onError:
5670     Py_DECREF(list);
5671     return NULL;
5672 }
5673
5674 PyObject *PyUnicode_Splitlines(PyObject *string,
5675                                int keepends)
5676 {
5677     register Py_ssize_t i;
5678     register Py_ssize_t j;
5679     Py_ssize_t len;
5680     PyObject *list;
5681     PyObject *str;
5682     Py_UNICODE *data;
5683
5684     string = PyUnicode_FromObject(string);
5685     if (string == NULL)
5686         return NULL;
5687     data = PyUnicode_AS_UNICODE(string);
5688     len = PyUnicode_GET_SIZE(string);
5689
5690     list = PyList_New(0);
5691     if (!list)
5692         goto onError;
5693
5694     for (i = j = 0; i < len; ) {
5695         Py_ssize_t eol;
5696
5697         /* Find a line and append it */
5698         while (i < len && !BLOOM_LINEBREAK(data[i]))
5699             i++;
5700
5701         /* Skip the line break reading CRLF as one line break */
5702         eol = i;
5703         if (i < len) {
5704             if (data[i] == '\r' && i + 1 < len &&
5705                 data[i+1] == '\n')
5706                 i += 2;
5707             else
5708                 i++;
5709             if (keepends)
5710                 eol = i;
5711         }
5712         SPLIT_APPEND(data, j, eol);
5713         j = i;
5714     }
5715     if (j < len) {
5716         SPLIT_APPEND(data, j, len);
5717     }
5718
5719     Py_DECREF(string);
5720     return list;
5721
5722  onError:
5723     Py_XDECREF(list);
5724     Py_DECREF(string);
5725     return NULL;
5726 }
5727
5728 static
5729 PyObject *split_char(PyUnicodeObject *self,
5730                      PyObject *list,
5731                      Py_UNICODE ch,
5732                      Py_ssize_t maxcount)
5733 {
5734     register Py_ssize_t i;
5735     register Py_ssize_t j;
5736     Py_ssize_t len = self->length;
5737     PyObject *str;
5738     register const Py_UNICODE *buf = self->str;
5739
5740     for (i = j = 0; i < len; ) {
5741         if (buf[i] == ch) {
5742             if (maxcount-- <= 0)
5743                 break;
5744             SPLIT_APPEND(buf, j, i);
5745             i = j = i + 1;
5746         } else
5747             i++;
5748     }
5749     if (j <= len) {
5750         SPLIT_APPEND(buf, j, len);
5751     }
5752     return list;
5753
5754  onError:
5755     Py_DECREF(list);
5756     return NULL;
5757 }
5758
5759 static
5760 PyObject *split_substring(PyUnicodeObject *self,
5761                           PyObject *list,
5762                           PyUnicodeObject *substring,
5763                           Py_ssize_t maxcount)
5764 {
5765     register Py_ssize_t i;
5766     register Py_ssize_t j;
5767     Py_ssize_t len = self->length;
5768     Py_ssize_t sublen = substring->length;
5769     PyObject *str;
5770
5771     for (i = j = 0; i <= len - sublen; ) {
5772         if (Py_UNICODE_MATCH(self, i, substring)) {
5773             if (maxcount-- <= 0)
5774                 break;
5775             SPLIT_APPEND(self->str, j, i);
5776             i = j = i + sublen;
5777         } else
5778             i++;
5779     }
5780     if (j <= len) {
5781         SPLIT_APPEND(self->str, j, len);
5782     }
5783     return list;
5784
5785  onError:
5786     Py_DECREF(list);
5787     return NULL;
5788 }
5789
5790 static
5791 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5792                             PyObject *list,
5793                             Py_ssize_t maxcount)
5794 {
5795     register Py_ssize_t i;
5796     register Py_ssize_t j;
5797     Py_ssize_t len = self->length;
5798     PyObject *str;
5799     register const Py_UNICODE *buf = self->str;
5800
5801     for (i = j = len - 1; i >= 0; ) {
5802         /* find a token */
5803         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5804             i--;
5805         j = i;
5806         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5807             i--;
5808         if (j > i) {
5809             if (maxcount-- <= 0)
5810                 break;
5811             SPLIT_APPEND(buf, i + 1, j + 1);
5812             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5813                 i--;
5814             j = i;
5815         }
5816     }
5817     if (j >= 0) {
5818         SPLIT_APPEND(buf, 0, j + 1);
5819     }
5820     if (PyList_Reverse(list) < 0)
5821         goto onError;
5822     return list;
5823
5824  onError:
5825     Py_DECREF(list);
5826     return NULL;
5827 }
5828
5829 static
5830 PyObject *rsplit_char(PyUnicodeObject *self,
5831                       PyObject *list,
5832                       Py_UNICODE ch,
5833                       Py_ssize_t maxcount)
5834 {
5835     register Py_ssize_t i;
5836     register Py_ssize_t j;
5837     Py_ssize_t len = self->length;
5838     PyObject *str;
5839     register const Py_UNICODE *buf = self->str;
5840
5841     for (i = j = len - 1; i >= 0; ) {
5842         if (buf[i] == ch) {
5843             if (maxcount-- <= 0)
5844                 break;
5845             SPLIT_APPEND(buf, i + 1, j + 1);
5846             j = i = i - 1;
5847         } else
5848             i--;
5849     }
5850     if (j >= -1) {
5851         SPLIT_APPEND(buf, 0, j + 1);
5852     }
5853     if (PyList_Reverse(list) < 0)
5854         goto onError;
5855     return list;
5856
5857  onError:
5858     Py_DECREF(list);
5859     return NULL;
5860 }
5861
5862 static
5863 PyObject *rsplit_substring(PyUnicodeObject *self,
5864                            PyObject *list,
5865                            PyUnicodeObject *substring,
5866                            Py_ssize_t maxcount)
5867 {
5868     register Py_ssize_t i;
5869     register Py_ssize_t j;
5870     Py_ssize_t len = self->length;
5871     Py_ssize_t sublen = substring->length;
5872     PyObject *str;
5873
5874     for (i = len - sublen, j = len; i >= 0; ) {
5875         if (Py_UNICODE_MATCH(self, i, substring)) {
5876             if (maxcount-- <= 0)
5877                 break;
5878             SPLIT_APPEND(self->str, i + sublen, j);
5879             j = i;
5880             i -= sublen;
5881         } else
5882             i--;
5883     }
5884     if (j >= 0) {
5885         SPLIT_APPEND(self->str, 0, j);
5886     }
5887     if (PyList_Reverse(list) < 0)
5888         goto onError;
5889     return list;
5890
5891  onError:
5892     Py_DECREF(list);
5893     return NULL;
5894 }
5895
5896 #undef SPLIT_APPEND
5897
5898 static
5899 PyObject *split(PyUnicodeObject *self,
5900                 PyUnicodeObject *substring,
5901                 Py_ssize_t maxcount)
5902 {
5903     PyObject *list;
5904
5905     if (maxcount < 0)
5906         maxcount = PY_SSIZE_T_MAX;
5907
5908     list = PyList_New(0);
5909     if (!list)
5910         return NULL;
5911
5912     if (substring == NULL)
5913         return split_whitespace(self,list,maxcount);
5914
5915     else if (substring->length == 1)
5916         return split_char(self,list,substring->str[0],maxcount);
5917
5918     else if (substring->length == 0) {
5919         Py_DECREF(list);
5920         PyErr_SetString(PyExc_ValueError, "empty separator");
5921         return NULL;
5922     }
5923     else
5924         return split_substring(self,list,substring,maxcount);
5925 }
5926
5927 static
5928 PyObject *rsplit(PyUnicodeObject *self,
5929                  PyUnicodeObject *substring,
5930                  Py_ssize_t maxcount)
5931 {
5932     PyObject *list;
5933
5934     if (maxcount < 0)
5935         maxcount = PY_SSIZE_T_MAX;
5936
5937     list = PyList_New(0);
5938     if (!list)
5939         return NULL;
5940
5941     if (substring == NULL)
5942         return rsplit_whitespace(self,list,maxcount);
5943
5944     else if (substring->length == 1)
5945         return rsplit_char(self,list,substring->str[0],maxcount);
5946
5947     else if (substring->length == 0) {
5948         Py_DECREF(list);
5949         PyErr_SetString(PyExc_ValueError, "empty separator");
5950         return NULL;
5951     }
5952     else
5953         return rsplit_substring(self,list,substring,maxcount);
5954 }
5955
5956 static
5957 PyObject *replace(PyUnicodeObject *self,
5958                   PyUnicodeObject *str1,
5959                   PyUnicodeObject *str2,
5960                   Py_ssize_t maxcount)
5961 {
5962     PyUnicodeObject *u;
5963
5964     if (maxcount < 0)
5965         maxcount = PY_SSIZE_T_MAX;
5966
5967     if (str1->length == str2->length) {
5968         /* same length */
5969         Py_ssize_t i;
5970         if (str1->length == 1) {
5971             /* replace characters */
5972             Py_UNICODE u1, u2;
5973             if (!findchar(self->str, self->length, str1->str[0]))
5974                 goto nothing;
5975             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5976             if (!u)
5977                 return NULL;
5978             Py_UNICODE_COPY(u->str, self->str, self->length);
5979             u1 = str1->str[0];
5980             u2 = str2->str[0];
5981             for (i = 0; i < u->length; i++)
5982                 if (u->str[i] == u1) {
5983                     if (--maxcount < 0)
5984                         break;
5985                     u->str[i] = u2;
5986                 }
5987         } else {
5988             i = fastsearch(
5989                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5990                 );
5991             if (i < 0)
5992                 goto nothing;
5993             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994             if (!u)
5995                 return NULL;
5996             Py_UNICODE_COPY(u->str, self->str, self->length);
5997             while (i <= self->length - str1->length)
5998                 if (Py_UNICODE_MATCH(self, i, str1)) {
5999                     if (--maxcount < 0)
6000                         break;
6001                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6002                     i += str1->length;
6003                 } else
6004                     i++;
6005         }
6006     } else {
6007
6008         Py_ssize_t n, i, j, e;
6009         Py_ssize_t product, new_size, delta;
6010         Py_UNICODE *p;
6011
6012         /* replace strings */
6013         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6014         if (n > maxcount)
6015             n = maxcount;
6016         if (n == 0)
6017             goto nothing;
6018         /* new_size = self->length + n * (str2->length - str1->length)); */
6019         delta = (str2->length - str1->length);
6020         if (delta == 0) {
6021             new_size = self->length;
6022         } else {
6023             product = n * (str2->length - str1->length);
6024             if ((product / (str2->length - str1->length)) != n) {
6025                 PyErr_SetString(PyExc_OverflowError,
6026                                 "replace string is too long");
6027                 return NULL;
6028             }
6029             new_size = self->length + product;
6030             if (new_size < 0) {
6031                 PyErr_SetString(PyExc_OverflowError,
6032                                 "replace string is too long");
6033                 return NULL;
6034             }
6035         }
6036         u = _PyUnicode_New(new_size);
6037         if (!u)
6038             return NULL;
6039         i = 0;
6040         p = u->str;
6041         e = self->length - str1->length;
6042         if (str1->length > 0) {
6043             while (n-- > 0) {
6044                 /* look for next match */
6045                 j = i;
6046                 while (j <= e) {
6047                     if (Py_UNICODE_MATCH(self, j, str1))
6048                         break;
6049                     j++;
6050                 }
6051                 if (j > i) {
6052                     if (j > e)
6053                         break;
6054                     /* copy unchanged part [i:j] */
6055                     Py_UNICODE_COPY(p, self->str+i, j-i);
6056                     p += j - i;
6057                 }
6058                 /* copy substitution string */
6059                 if (str2->length > 0) {
6060                     Py_UNICODE_COPY(p, str2->str, str2->length);
6061                     p += str2->length;
6062                 }
6063                 i = j + str1->length;
6064             }
6065             if (i < self->length)
6066                 /* copy tail [i:] */
6067                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6068         } else {
6069             /* interleave */
6070             while (n > 0) {
6071                 Py_UNICODE_COPY(p, str2->str, str2->length);
6072                 p += str2->length;
6073                 if (--n <= 0)
6074                     break;
6075                 *p++ = self->str[i++];
6076             }
6077             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6078         }
6079     }
6080     return (PyObject *) u;
6081
6082 nothing:
6083     /* nothing to replace; return original string (when possible) */
6084     if (PyUnicode_CheckExact(self)) {
6085         Py_INCREF(self);
6086         return (PyObject *) self;
6087     }
6088     return PyUnicode_FromUnicode(self->str, self->length);
6089 }
6090
6091 /* --- Unicode Object Methods --------------------------------------------- */
6092
6093 PyDoc_STRVAR(title__doc__,
6094 "S.title() -> unicode\n\
6095 \n\
6096 Return a titlecased version of S, i.e. words start with title case\n\
6097 characters, all remaining cased characters have lower case.");
6098
6099 static PyObject*
6100 unicode_title(PyUnicodeObject *self)
6101 {
6102     return fixup(self, fixtitle);
6103 }
6104
6105 PyDoc_STRVAR(capitalize__doc__,
6106 "S.capitalize() -> unicode\n\
6107 \n\
6108 Return a capitalized version of S, i.e. make the first character\n\
6109 have upper case.");
6110
6111 static PyObject*
6112 unicode_capitalize(PyUnicodeObject *self)
6113 {
6114     return fixup(self, fixcapitalize);
6115 }
6116
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, runs fixcapitalize over every word through
   fixup(), and joins the words back together with PyUnicode_Join(NULL,
   ...), i.e. with a single space.  Returns NULL on any failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
6154
6155 /* Argument converter.  Coerces to a single unicode character */
6156
6157 static int
6158 convert_uc(PyObject *obj, void *addr)
6159 {
6160         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6161         PyObject *uniobj;
6162         Py_UNICODE *unistr;
6163
6164         uniobj = PyUnicode_FromObject(obj);
6165         if (uniobj == NULL) {
6166                 PyErr_SetString(PyExc_TypeError,
6167                         "The fill character cannot be converted to Unicode");
6168                 return 0;
6169         }
6170         if (PyUnicode_GET_SIZE(uniobj) != 1) {
6171                 PyErr_SetString(PyExc_TypeError,
6172                         "The fill character must be exactly one character long");
6173                 Py_DECREF(uniobj);
6174                 return 0;
6175         }
6176         unistr = PyUnicode_AS_UNICODE(uniobj);
6177         *fillcharloc = unistr[0];
6178         Py_DECREF(uniobj);
6179         return 1;
6180 }
6181
6182 PyDoc_STRVAR(center__doc__,
6183 "S.center(width[, fillchar]) -> unicode\n\
6184 \n\
6185 Return S centered in a Unicode string of length width. Padding is\n\
6186 done using the specified fill character (default is a space)");
6187
6188 static PyObject *
6189 unicode_center(PyUnicodeObject *self, PyObject *args)
6190 {
6191     Py_ssize_t marg, left;
6192     Py_ssize_t width;
6193     Py_UNICODE fillchar = ' ';
6194
6195     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6196         return NULL;
6197
6198     if (self->length >= width && PyUnicode_CheckExact(self)) {
6199         Py_INCREF(self);
6200         return (PyObject*) self;
6201     }
6202
6203     marg = width - self->length;
6204     left = marg / 2 + (marg & width & 1);
6205
6206     return (PyObject*) pad(self, left, marg - left, fillchar);
6207 }
6208
6209 #if 0
6210
6211 /* This code should go into some future Unicode collation support
6212    module. The basic comparison should compare ordinals on a naive
6213    basis (this is what Java does and thus JPython too). */
6214
6215 /* speedy UTF-16 code point order comparison */
6216 /* gleaned from: */
6217 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6218
6219 static short utf16Fixup[32] =
6220 {
6221     0, 0, 0, 0, 0, 0, 0, 0,
6222     0, 0, 0, 0, 0, 0, 0, 0,
6223     0, 0, 0, 0, 0, 0, 0, 0,
6224     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6225 };
6226
6227 static int
6228 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6229 {
6230     Py_ssize_t len1, len2;
6231
6232     Py_UNICODE *s1 = str1->str;
6233     Py_UNICODE *s2 = str2->str;
6234
6235     len1 = str1->length;
6236     len2 = str2->length;
6237
6238     while (len1 > 0 && len2 > 0) {
6239         Py_UNICODE c1, c2;
6240
6241         c1 = *s1++;
6242         c2 = *s2++;
6243
6244         if (c1 > (1<<11) * 26)
6245             c1 += utf16Fixup[c1>>11];
6246         if (c2 > (1<<11) * 26)
6247             c2 += utf16Fixup[c2>>11];
6248         /* now c1 and c2 are in UTF-32-compatible order */
6249
6250         if (c1 != c2)
6251             return (c1 < c2) ? -1 : 1;
6252
6253         len1--; len2--;
6254     }
6255
6256     return (len1 < len2) ? -1 : (len1 != len2);
6257 }
6258
6259 #else
6260
6261 static int
6262 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6263 {
6264     register Py_ssize_t len1, len2;
6265
6266     Py_UNICODE *s1 = str1->str;
6267     Py_UNICODE *s2 = str2->str;
6268
6269     len1 = str1->length;
6270     len2 = str2->length;
6271
6272     while (len1 > 0 && len2 > 0) {
6273         Py_UNICODE c1, c2;
6274
6275         c1 = *s1++;
6276         c2 = *s2++;
6277
6278         if (c1 != c2)
6279             return (c1 < c2) ? -1 : 1;
6280
6281         len1--; len2--;
6282     }
6283
6284     return (len1 < len2) ? -1 : (len1 != len2);
6285 }
6286
6287 #endif
6288
6289 int PyUnicode_Compare(PyObject *left,
6290                       PyObject *right)
6291 {
6292     PyUnicodeObject *u = NULL, *v = NULL;
6293     int result;
6294
6295     /* Coerce the two arguments */
6296     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6297     if (u == NULL)
6298         goto onError;
6299     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6300     if (v == NULL)
6301         goto onError;
6302
6303     /* Shortcut for empty or interned objects */
6304     if (v == u) {
6305         Py_DECREF(u);
6306         Py_DECREF(v);
6307         return 0;
6308     }
6309
6310     result = unicode_compare(u, v);
6311
6312     Py_DECREF(u);
6313     Py_DECREF(v);
6314     return result;
6315
6316 onError:
6317     Py_XDECREF(u);
6318     Py_XDECREF(v);
6319     return -1;
6320 }
6321
6322 PyObject *PyUnicode_RichCompare(PyObject *left,
6323                                 PyObject *right,
6324                                 int op)
6325 {
6326     int result;
6327
6328     result = PyUnicode_Compare(left, right);
6329     if (result == -1 && PyErr_Occurred())
6330         goto onError;
6331
6332     /* Convert the return value to a Boolean */
6333     switch (op) {
6334     case Py_EQ:
6335         result = (result == 0);
6336         break;
6337     case Py_NE:
6338         result = (result != 0);
6339         break;
6340     case Py_LE:
6341         result = (result <= 0);
6342         break;
6343     case Py_GE:
6344         result = (result >= 0);
6345         break;
6346     case Py_LT:
6347         result = (result == -1);
6348         break;
6349     case Py_GT:
6350         result = (result == 1);
6351         break;
6352     }
6353     return PyBool_FromLong(result);
6354
6355  onError:
6356
6357     /* Standard case
6358
6359        Type errors mean that PyUnicode_FromObject() could not convert
6360        one of the arguments (usually the right hand side) to Unicode,
6361        ie. we can't handle the comparison request. However, it is
6362        possible that the other object knows a comparison method, which
6363        is why we return Py_NotImplemented to give the other object a
6364        chance.
6365
6366     */
6367     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6368         PyErr_Clear();
6369         Py_INCREF(Py_NotImplemented);
6370         return Py_NotImplemented;
6371     }
6372     if (op != Py_EQ && op != Py_NE)
6373         return NULL;
6374
6375     /* Equality comparison.
6376
6377        This is a special case: we silence any PyExc_UnicodeDecodeError
6378        and instead turn it into a PyErr_UnicodeWarning.
6379
6380     */
6381     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6382         return NULL;
6383     PyErr_Clear();
6384     if (PyErr_Warn(PyExc_UnicodeWarning,
6385                    (op == Py_EQ) ?
6386                    "Unicode equal comparison "
6387                    "failed to convert both arguments to Unicode - "
6388                    "interpreting them as being unequal" :
6389                    "Unicode unequal comparison "
6390                    "failed to convert both arguments to Unicode - "
6391                    "interpreting them as being unequal"
6392                    ) < 0)
6393         return NULL;
6394     result = (op == Py_NE);
6395     return PyBool_FromLong(result);
6396 }
6397
6398 int PyUnicode_Contains(PyObject *container,
6399                        PyObject *element)
6400 {
6401     PyObject *str, *sub;
6402     int result;
6403
6404     /* Coerce the two arguments */
6405     sub = PyUnicode_FromObject(element);
6406     if (!sub) {
6407         PyErr_SetString(PyExc_TypeError,
6408             "'in <string>' requires string as left operand");
6409         return -1;
6410     }
6411
6412     str = PyUnicode_FromObject(container);
6413     if (!str) {
6414         Py_DECREF(sub);
6415         return -1;
6416     }
6417
6418     result = stringlib_contains_obj(str, sub);
6419
6420     Py_DECREF(str);
6421     Py_DECREF(sub);
6422
6423     return result;
6424 }
6425
6426 /* Concat to string or Unicode object giving a new Unicode object. */
6427
6428 PyObject *PyUnicode_Concat(PyObject *left,
6429                            PyObject *right)
6430 {
6431     PyUnicodeObject *u = NULL, *v = NULL, *w;
6432
6433     /* Coerce the two arguments */
6434     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6435     if (u == NULL)
6436         goto onError;
6437     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6438     if (v == NULL)
6439         goto onError;
6440
6441     /* Shortcuts */
6442     if (v == unicode_empty) {
6443         Py_DECREF(v);
6444         return (PyObject *)u;
6445     }
6446     if (u == unicode_empty) {
6447         Py_DECREF(u);
6448         return (PyObject *)v;
6449     }
6450
6451     /* Concat the two Unicode strings */
6452     w = _PyUnicode_New(u->length + v->length);
6453     if (w == NULL)
6454         goto onError;
6455     Py_UNICODE_COPY(w->str, u->str, u->length);
6456     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6457
6458     Py_DECREF(u);
6459     Py_DECREF(v);
6460     return (PyObject *)w;
6461
6462 onError:
6463     Py_XDECREF(u);
6464     Py_XDECREF(v);
6465     return NULL;
6466 }
6467
6468 PyDoc_STRVAR(count__doc__,
6469 "S.count(sub[, start[, end]]) -> int\n\
6470 \n\
6471 Return the number of non-overlapping occurrences of substring sub in\n\
6472 Unicode string S[start:end].  Optional arguments start and end are\n\
6473 interpreted as in slice notation.");
6474
6475 static PyObject *
6476 unicode_count(PyUnicodeObject *self, PyObject *args)
6477 {
6478     PyUnicodeObject *substring;
6479     Py_ssize_t start = 0;
6480     Py_ssize_t end = PY_SSIZE_T_MAX;
6481     PyObject *result;
6482
6483     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6484                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6485         return NULL;
6486
6487     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6488         (PyObject *)substring);
6489     if (substring == NULL)
6490         return NULL;
6491
6492     FIX_START_END(self);
6493
6494     result = PyInt_FromSsize_t(
6495         stringlib_count(self->str + start, end - start,
6496                         substring->str, substring->length)
6497         );
6498
6499     Py_DECREF(substring);
6500
6501     return result;
6502 }
6503
6504 PyDoc_STRVAR(encode__doc__,
6505 "S.encode([encoding[,errors]]) -> string or unicode\n\
6506 \n\
6507 Encodes S using the codec registered for encoding. encoding defaults\n\
6508 to the default encoding. errors may be given to set a different error\n\
6509 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6510 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6511 'xmlcharrefreplace' as well as any other name registered with\n\
6512 codecs.register_error that can handle UnicodeEncodeErrors.");
6513
6514 static PyObject *
6515 unicode_encode(PyUnicodeObject *self, PyObject *args)
6516 {
6517     char *encoding = NULL;
6518     char *errors = NULL;
6519     PyObject *v;
6520
6521     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6522         return NULL;
6523     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6524     if (v == NULL)
6525         goto onError;
6526     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6527         PyErr_Format(PyExc_TypeError,
6528                      "encoder did not return a string/unicode object "
6529                      "(type=%.400s)",
6530                      Py_TYPE(v)->tp_name);
6531         Py_DECREF(v);
6532         return NULL;
6533     }
6534     return v;
6535
6536  onError:
6537     return NULL;
6538 }
6539
6540 PyDoc_STRVAR(decode__doc__,
6541 "S.decode([encoding[,errors]]) -> string or unicode\n\
6542 \n\
6543 Decodes S using the codec registered for encoding. encoding defaults\n\
6544 to the default encoding. errors may be given to set a different error\n\
6545 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6546 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6547 as well as any other name registerd with codecs.register_error that is\n\
6548 able to handle UnicodeDecodeErrors.");
6549
6550 static PyObject *
6551 unicode_decode(PyUnicodeObject *self, PyObject *args)
6552 {
6553     char *encoding = NULL;
6554     char *errors = NULL;
6555     PyObject *v;
6556
6557     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6558         return NULL;
6559     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6560     if (v == NULL)
6561         goto onError;
6562     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6563         PyErr_Format(PyExc_TypeError,
6564                      "decoder did not return a string/unicode object "
6565                      "(type=%.400s)",
6566                      Py_TYPE(v)->tp_name);
6567         Py_DECREF(v);
6568         return NULL;
6569     }
6570     return v;
6571
6572  onError:
6573     return NULL;
6574 }
6575
6576 PyDoc_STRVAR(expandtabs__doc__,
6577 "S.expandtabs([tabsize]) -> unicode\n\
6578 \n\
6579 Return a copy of S where all tab characters are expanded using spaces.\n\
6580 If tabsize is not given, a tab size of 8 characters is assumed.");
6581
6582 static PyObject*
6583 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6584 {
6585     Py_UNICODE *e;
6586     Py_UNICODE *p;
6587     Py_UNICODE *q;
6588     Py_UNICODE *qe;
6589     Py_ssize_t i, j, incr;
6590     PyUnicodeObject *u;
6591     int tabsize = 8;
6592
6593     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6594         return NULL;
6595
6596     /* First pass: determine size of output string */
6597     i = 0; /* chars up to and including most recent \n or \r */
6598     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6599     e = self->str + self->length; /* end of input */
6600     for (p = self->str; p < e; p++)
6601         if (*p == '\t') {
6602             if (tabsize > 0) {
6603                 incr = tabsize - (j % tabsize); /* cannot overflow */
6604                 if (j > PY_SSIZE_T_MAX - incr)
6605                     goto overflow1;
6606                 j += incr;
6607             }
6608         }
6609         else {
6610             if (j > PY_SSIZE_T_MAX - 1)
6611                 goto overflow1;
6612             j++;
6613             if (*p == '\n' || *p == '\r') {
6614                 if (i > PY_SSIZE_T_MAX - j)
6615                     goto overflow1;
6616                 i += j;
6617                 j = 0;
6618             }
6619         }
6620
6621     if (i > PY_SSIZE_T_MAX - j)
6622         goto overflow1;
6623
6624     /* Second pass: create output string and fill it */
6625     u = _PyUnicode_New(i + j);
6626     if (!u)
6627         return NULL;
6628
6629     j = 0; /* same as in first pass */
6630     q = u->str; /* next output char */
6631     qe = u->str + u->length; /* end of output */
6632
6633     for (p = self->str; p < e; p++)
6634         if (*p == '\t') {
6635             if (tabsize > 0) {
6636                 i = tabsize - (j % tabsize);
6637                 j += i;
6638                 while (i--) {
6639                     if (q >= qe)
6640                         goto overflow2;
6641                     *q++ = ' ';
6642                 }
6643             }
6644         }
6645         else {
6646             if (q >= qe)
6647                 goto overflow2;
6648             *q++ = *p;
6649             j++;
6650             if (*p == '\n' || *p == '\r')
6651                 j = 0;
6652         }
6653
6654     return (PyObject*) u;
6655
6656   overflow2:
6657     Py_DECREF(u);
6658   overflow1:
6659     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6660     return NULL;
6661 }
6662
6663 PyDoc_STRVAR(find__doc__,
6664 "S.find(sub [,start [,end]]) -> int\n\
6665 \n\
6666 Return the lowest index in S where substring sub is found,\n\
6667 such that sub is contained within s[start:end].  Optional\n\
6668 arguments start and end are interpreted as in slice notation.\n\
6669 \n\
6670 Return -1 on failure.");
6671
6672 static PyObject *
6673 unicode_find(PyUnicodeObject *self, PyObject *args)
6674 {
6675     PyObject *substring;
6676     Py_ssize_t start;
6677     Py_ssize_t end;
6678     Py_ssize_t result;
6679
6680     if (!_ParseTupleFinds(args, &substring, &start, &end))
6681         return NULL;
6682
6683     result = stringlib_find_slice(
6684         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6685         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6686         start, end
6687         );
6688
6689     Py_DECREF(substring);
6690
6691     return PyInt_FromSsize_t(result);
6692 }
6693
6694 static PyObject *
6695 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6696 {
6697     if (index < 0 || index >= self->length) {
6698         PyErr_SetString(PyExc_IndexError, "string index out of range");
6699         return NULL;
6700     }
6701
6702     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6703 }
6704
6705 static long
6706 unicode_hash(PyUnicodeObject *self)
6707 {
6708     /* Since Unicode objects compare equal to their ASCII string
6709        counterparts, they should use the individual character values
6710        as basis for their hash value.  This is needed to assure that
6711        strings and Unicode objects behave in the same way as
6712        dictionary keys. */
6713
6714     register Py_ssize_t len;
6715     register Py_UNICODE *p;
6716     register long x;
6717
6718     if (self->hash != -1)
6719         return self->hash;
6720     len = PyUnicode_GET_SIZE(self);
6721     p = PyUnicode_AS_UNICODE(self);
6722     x = *p << 7;
6723     while (--len >= 0)
6724         x = (1000003*x) ^ *p++;
6725     x ^= PyUnicode_GET_SIZE(self);
6726     if (x == -1)
6727         x = -2;
6728     self->hash = x;
6729     return x;
6730 }
6731
6732 PyDoc_STRVAR(index__doc__,
6733 "S.index(sub [,start [,end]]) -> int\n\
6734 \n\
6735 Like S.find() but raise ValueError when the substring is not found.");
6736
6737 static PyObject *
6738 unicode_index(PyUnicodeObject *self, PyObject *args)
6739 {
6740     Py_ssize_t result;
6741     PyObject *substring;
6742     Py_ssize_t start;
6743     Py_ssize_t end;
6744
6745     if (!_ParseTupleFinds(args, &substring, &start, &end))
6746         return NULL;
6747
6748     result = stringlib_find_slice(
6749         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6750         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6751         start, end
6752         );
6753
6754     Py_DECREF(substring);
6755
6756     if (result < 0) {
6757         PyErr_SetString(PyExc_ValueError, "substring not found");
6758         return NULL;
6759     }
6760
6761     return PyInt_FromSsize_t(result);
6762 }
6763
6764 PyDoc_STRVAR(islower__doc__,
6765 "S.islower() -> bool\n\
6766 \n\
6767 Return True if all cased characters in S are lowercase and there is\n\
6768 at least one cased character in S, False otherwise.");
6769
6770 static PyObject*
6771 unicode_islower(PyUnicodeObject *self)
6772 {
6773     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774     register const Py_UNICODE *e;
6775     int cased;
6776
6777     /* Shortcut for single character strings */
6778     if (PyUnicode_GET_SIZE(self) == 1)
6779         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6780
6781     /* Special case for empty strings */
6782     if (PyUnicode_GET_SIZE(self) == 0)
6783         return PyBool_FromLong(0);
6784
6785     e = p + PyUnicode_GET_SIZE(self);
6786     cased = 0;
6787     for (; p < e; p++) {
6788         register const Py_UNICODE ch = *p;
6789
6790         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6791             return PyBool_FromLong(0);
6792         else if (!cased && Py_UNICODE_ISLOWER(ch))
6793             cased = 1;
6794     }
6795     return PyBool_FromLong(cased);
6796 }
6797
6798 PyDoc_STRVAR(isupper__doc__,
6799 "S.isupper() -> bool\n\
6800 \n\
6801 Return True if all cased characters in S are uppercase and there is\n\
6802 at least one cased character in S, False otherwise.");
6803
6804 static PyObject*
6805 unicode_isupper(PyUnicodeObject *self)
6806 {
6807     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808     register const Py_UNICODE *e;
6809     int cased;
6810
6811     /* Shortcut for single character strings */
6812     if (PyUnicode_GET_SIZE(self) == 1)
6813         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6814
6815     /* Special case for empty strings */
6816     if (PyUnicode_GET_SIZE(self) == 0)
6817         return PyBool_FromLong(0);
6818
6819     e = p + PyUnicode_GET_SIZE(self);
6820     cased = 0;
6821     for (; p < e; p++) {
6822         register const Py_UNICODE ch = *p;
6823
6824         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6825             return PyBool_FromLong(0);
6826         else if (!cased && Py_UNICODE_ISUPPER(ch))
6827             cased = 1;
6828     }
6829     return PyBool_FromLong(cased);
6830 }
6831
6832 PyDoc_STRVAR(istitle__doc__,
6833 "S.istitle() -> bool\n\
6834 \n\
6835 Return True if S is a titlecased string and there is at least one\n\
6836 character in S, i.e. upper- and titlecase characters may only\n\
6837 follow uncased characters and lowercase characters only cased ones.\n\
6838 Return False otherwise.");
6839
6840 static PyObject*
6841 unicode_istitle(PyUnicodeObject *self)
6842 {
6843     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6844     register const Py_UNICODE *e;
6845     int cased, previous_is_cased;
6846
6847     /* Shortcut for single character strings */
6848     if (PyUnicode_GET_SIZE(self) == 1)
6849         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6850                                (Py_UNICODE_ISUPPER(*p) != 0));
6851
6852     /* Special case for empty strings */
6853     if (PyUnicode_GET_SIZE(self) == 0)
6854         return PyBool_FromLong(0);
6855
6856     e = p + PyUnicode_GET_SIZE(self);
6857     cased = 0;
6858     previous_is_cased = 0;
6859     for (; p < e; p++) {
6860         register const Py_UNICODE ch = *p;
6861
6862         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6863             if (previous_is_cased)
6864                 return PyBool_FromLong(0);
6865             previous_is_cased = 1;
6866             cased = 1;
6867         }
6868         else if (Py_UNICODE_ISLOWER(ch)) {
6869             if (!previous_is_cased)
6870                 return PyBool_FromLong(0);
6871             previous_is_cased = 1;
6872             cased = 1;
6873         }
6874         else
6875             previous_is_cased = 0;
6876     }
6877     return PyBool_FromLong(cased);
6878 }
6879
6880 PyDoc_STRVAR(isspace__doc__,
6881 "S.isspace() -> bool\n\
6882 \n\
6883 Return True if all characters in S are whitespace\n\
6884 and there is at least one character in S, False otherwise.");
6885
6886 static PyObject*
6887 unicode_isspace(PyUnicodeObject *self)
6888 {
6889     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890     register const Py_UNICODE *e;
6891
6892     /* Shortcut for single character strings */
6893     if (PyUnicode_GET_SIZE(self) == 1 &&
6894         Py_UNICODE_ISSPACE(*p))
6895         return PyBool_FromLong(1);
6896
6897     /* Special case for empty strings */
6898     if (PyUnicode_GET_SIZE(self) == 0)
6899         return PyBool_FromLong(0);
6900
6901     e = p + PyUnicode_GET_SIZE(self);
6902     for (; p < e; p++) {
6903         if (!Py_UNICODE_ISSPACE(*p))
6904             return PyBool_FromLong(0);
6905     }
6906     return PyBool_FromLong(1);
6907 }
6908
6909 PyDoc_STRVAR(isalpha__doc__,
6910 "S.isalpha() -> bool\n\
6911 \n\
6912 Return True if all characters in S are alphabetic\n\
6913 and there is at least one character in S, False otherwise.");
6914
6915 static PyObject*
6916 unicode_isalpha(PyUnicodeObject *self)
6917 {
6918     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919     register const Py_UNICODE *e;
6920
6921     /* Shortcut for single character strings */
6922     if (PyUnicode_GET_SIZE(self) == 1 &&
6923         Py_UNICODE_ISALPHA(*p))
6924         return PyBool_FromLong(1);
6925
6926     /* Special case for empty strings */
6927     if (PyUnicode_GET_SIZE(self) == 0)
6928         return PyBool_FromLong(0);
6929
6930     e = p + PyUnicode_GET_SIZE(self);
6931     for (; p < e; p++) {
6932         if (!Py_UNICODE_ISALPHA(*p))
6933             return PyBool_FromLong(0);
6934     }
6935     return PyBool_FromLong(1);
6936 }
6937
6938 PyDoc_STRVAR(isalnum__doc__,
6939 "S.isalnum() -> bool\n\
6940 \n\
6941 Return True if all characters in S are alphanumeric\n\
6942 and there is at least one character in S, False otherwise.");
6943
6944 static PyObject*
6945 unicode_isalnum(PyUnicodeObject *self)
6946 {
6947     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948     register const Py_UNICODE *e;
6949
6950     /* Shortcut for single character strings */
6951     if (PyUnicode_GET_SIZE(self) == 1 &&
6952         Py_UNICODE_ISALNUM(*p))
6953         return PyBool_FromLong(1);
6954
6955     /* Special case for empty strings */
6956     if (PyUnicode_GET_SIZE(self) == 0)
6957         return PyBool_FromLong(0);
6958
6959     e = p + PyUnicode_GET_SIZE(self);
6960     for (; p < e; p++) {
6961         if (!Py_UNICODE_ISALNUM(*p))
6962             return PyBool_FromLong(0);
6963     }
6964     return PyBool_FromLong(1);
6965 }
6966
6967 PyDoc_STRVAR(isdecimal__doc__,
6968 "S.isdecimal() -> bool\n\
6969 \n\
6970 Return True if there are only decimal characters in S,\n\
6971 False otherwise.");
6972
6973 static PyObject*
6974 unicode_isdecimal(PyUnicodeObject *self)
6975 {
6976     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977     register const Py_UNICODE *e;
6978
6979     /* Shortcut for single character strings */
6980     if (PyUnicode_GET_SIZE(self) == 1 &&
6981         Py_UNICODE_ISDECIMAL(*p))
6982         return PyBool_FromLong(1);
6983
6984     /* Special case for empty strings */
6985     if (PyUnicode_GET_SIZE(self) == 0)
6986         return PyBool_FromLong(0);
6987
6988     e = p + PyUnicode_GET_SIZE(self);
6989     for (; p < e; p++) {
6990         if (!Py_UNICODE_ISDECIMAL(*p))
6991             return PyBool_FromLong(0);
6992     }
6993     return PyBool_FromLong(1);
6994 }
6995
6996 PyDoc_STRVAR(isdigit__doc__,
6997 "S.isdigit() -> bool\n\
6998 \n\
6999 Return True if all characters in S are digits\n\
7000 and there is at least one character in S, False otherwise.");
7001
7002 static PyObject*
7003 unicode_isdigit(PyUnicodeObject *self)
7004 {
7005     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006     register const Py_UNICODE *e;
7007
7008     /* Shortcut for single character strings */
7009     if (PyUnicode_GET_SIZE(self) == 1 &&
7010         Py_UNICODE_ISDIGIT(*p))
7011         return PyBool_FromLong(1);
7012
7013     /* Special case for empty strings */
7014     if (PyUnicode_GET_SIZE(self) == 0)
7015         return PyBool_FromLong(0);
7016
7017     e = p + PyUnicode_GET_SIZE(self);
7018     for (; p < e; p++) {
7019         if (!Py_UNICODE_ISDIGIT(*p))
7020             return PyBool_FromLong(0);
7021     }
7022     return PyBool_FromLong(1);
7023 }
7024
7025 PyDoc_STRVAR(isnumeric__doc__,
7026 "S.isnumeric() -> bool\n\
7027 \n\
7028 Return True if there are only numeric characters in S,\n\
7029 False otherwise.");
7030
7031 static PyObject*
7032 unicode_isnumeric(PyUnicodeObject *self)
7033 {
7034     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035     register const Py_UNICODE *e;
7036
7037     /* Shortcut for single character strings */
7038     if (PyUnicode_GET_SIZE(self) == 1 &&
7039         Py_UNICODE_ISNUMERIC(*p))
7040         return PyBool_FromLong(1);
7041
7042     /* Special case for empty strings */
7043     if (PyUnicode_GET_SIZE(self) == 0)
7044         return PyBool_FromLong(0);
7045
7046     e = p + PyUnicode_GET_SIZE(self);
7047     for (; p < e; p++) {
7048         if (!Py_UNICODE_ISNUMERIC(*p))
7049             return PyBool_FromLong(0);
7050     }
7051     return PyBool_FromLong(1);
7052 }
7053
7054 PyDoc_STRVAR(join__doc__,
7055 "S.join(sequence) -> unicode\n\
7056 \n\
7057 Return a string which is the concatenation of the strings in the\n\
7058 sequence.  The separator between elements is S.");
7059
7060 static PyObject*
7061 unicode_join(PyObject *self, PyObject *data)
7062 {
7063     return PyUnicode_Join(self, data);
7064 }
7065
7066 static Py_ssize_t
7067 unicode_length(PyUnicodeObject *self)
7068 {
7069     return self->length;
7070 }
7071
7072 PyDoc_STRVAR(ljust__doc__,
7073 "S.ljust(width[, fillchar]) -> int\n\
7074 \n\
7075 Return S left-justified in a Unicode string of length width. Padding is\n\
7076 done using the specified fill character (default is a space).");
7077
7078 static PyObject *
7079 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7080 {
7081     Py_ssize_t width;
7082     Py_UNICODE fillchar = ' ';
7083
7084     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7085         return NULL;
7086
7087     if (self->length >= width && PyUnicode_CheckExact(self)) {
7088         Py_INCREF(self);
7089         return (PyObject*) self;
7090     }
7091
7092     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7093 }
7094
7095 PyDoc_STRVAR(lower__doc__,
7096 "S.lower() -> unicode\n\
7097 \n\
7098 Return a copy of the string S converted to lowercase.");
7099
7100 static PyObject*
7101 unicode_lower(PyUnicodeObject *self)
7102 {
7103     return fixup(self, fixlower);
7104 }
7105
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: its format string minus the leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7114
7115 /* externally visible for str.strip(unicode) */
7116 PyObject *
7117 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7118 {
7119         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7120         Py_ssize_t len = PyUnicode_GET_SIZE(self);
7121         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7122         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7123         Py_ssize_t i, j;
7124
7125         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7126
7127         i = 0;
7128         if (striptype != RIGHTSTRIP) {
7129             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7130                 i++;
7131             }
7132         }
7133
7134         j = len;
7135         if (striptype != LEFTSTRIP) {
7136             do {
7137                 j--;
7138             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7139             j++;
7140         }
7141
7142         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7143             Py_INCREF(self);
7144             return (PyObject*)self;
7145         }
7146         else
7147             return PyUnicode_FromUnicode(s+i, j-i);
7148 }
7149
7150
7151 static PyObject *
7152 do_strip(PyUnicodeObject *self, int striptype)
7153 {
7154         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7155         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7156
7157         i = 0;
7158         if (striptype != RIGHTSTRIP) {
7159                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7160                         i++;
7161                 }
7162         }
7163
7164         j = len;
7165         if (striptype != LEFTSTRIP) {
7166                 do {
7167                         j--;
7168                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7169                 j++;
7170         }
7171
7172         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7173                 Py_INCREF(self);
7174                 return (PyObject*)self;
7175         }
7176         else
7177                 return PyUnicode_FromUnicode(s+i, j-i);
7178 }
7179
7180
7181 static PyObject *
7182 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7183 {
7184         PyObject *sep = NULL;
7185
7186         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7187                 return NULL;
7188
7189         if (sep != NULL && sep != Py_None) {
7190                 if (PyUnicode_Check(sep))
7191                         return _PyUnicode_XStrip(self, striptype, sep);
7192                 else if (PyString_Check(sep)) {
7193                         PyObject *res;
7194                         sep = PyUnicode_FromObject(sep);
7195                         if (sep==NULL)
7196                                 return NULL;
7197                         res = _PyUnicode_XStrip(self, striptype, sep);
7198                         Py_DECREF(sep);
7199                         return res;
7200                 }
7201                 else {
7202                         PyErr_Format(PyExc_TypeError,
7203                                      "%s arg must be None, unicode or str",
7204                                      STRIPNAME(striptype));
7205                         return NULL;
7206                 }
7207         }
7208
7209         return do_strip(self, striptype);
7210 }
7211
7212
7213 PyDoc_STRVAR(strip__doc__,
7214 "S.strip([chars]) -> unicode\n\
7215 \n\
7216 Return a copy of the string S with leading and trailing\n\
7217 whitespace removed.\n\
7218 If chars is given and not None, remove characters in chars instead.\n\
7219 If chars is a str, it will be converted to unicode before stripping");
7220
7221 static PyObject *
7222 unicode_strip(PyUnicodeObject *self, PyObject *args)
7223 {
7224         if (PyTuple_GET_SIZE(args) == 0)
7225                 return do_strip(self, BOTHSTRIP); /* Common case */
7226         else
7227                 return do_argstrip(self, BOTHSTRIP, args);
7228 }
7229
7230
7231 PyDoc_STRVAR(lstrip__doc__,
7232 "S.lstrip([chars]) -> unicode\n\
7233 \n\
7234 Return a copy of the string S with leading whitespace removed.\n\
7235 If chars is given and not None, remove characters in chars instead.\n\
7236 If chars is a str, it will be converted to unicode before stripping");
7237
7238 static PyObject *
7239 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7240 {
7241         if (PyTuple_GET_SIZE(args) == 0)
7242                 return do_strip(self, LEFTSTRIP); /* Common case */
7243         else
7244                 return do_argstrip(self, LEFTSTRIP, args);
7245 }
7246
7247
7248 PyDoc_STRVAR(rstrip__doc__,
7249 "S.rstrip([chars]) -> unicode\n\
7250 \n\
7251 Return a copy of the string S with trailing whitespace removed.\n\
7252 If chars is given and not None, remove characters in chars instead.\n\
7253 If chars is a str, it will be converted to unicode before stripping");
7254
7255 static PyObject *
7256 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7257 {
7258         if (PyTuple_GET_SIZE(args) == 0)
7259                 return do_strip(self, RIGHTSTRIP); /* Common case */
7260         else
7261                 return do_argstrip(self, RIGHTSTRIP, args);
7262 }
7263
7264
7265 static PyObject*
7266 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7267 {
7268     PyUnicodeObject *u;
7269     Py_UNICODE *p;
7270     Py_ssize_t nchars;
7271     size_t nbytes;
7272
7273     if (len < 0)
7274         len = 0;
7275
7276     if (len == 1 && PyUnicode_CheckExact(str)) {
7277         /* no repeat, return original string */
7278         Py_INCREF(str);
7279         return (PyObject*) str;
7280     }
7281
7282     /* ensure # of chars needed doesn't overflow int and # of bytes
7283      * needed doesn't overflow size_t
7284      */
7285     nchars = len * str->length;
7286     if (len && nchars / len != str->length) {
7287         PyErr_SetString(PyExc_OverflowError,
7288                         "repeated string is too long");
7289         return NULL;
7290     }
7291     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7292     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7293         PyErr_SetString(PyExc_OverflowError,
7294                         "repeated string is too long");
7295         return NULL;
7296     }
7297     u = _PyUnicode_New(nchars);
7298     if (!u)
7299         return NULL;
7300
7301     p = u->str;
7302
7303     if (str->length == 1 && len > 0) {
7304         Py_UNICODE_FILL(p, str->str[0], len);
7305     } else {
7306         Py_ssize_t done = 0; /* number of characters copied this far */
7307         if (done < nchars) {
7308             Py_UNICODE_COPY(p, str->str, str->length);
7309             done = str->length;
7310         }
7311         while (done < nchars) {
7312             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7313             Py_UNICODE_COPY(p+done, p, n);
7314             done += n;
7315         }
7316     }
7317
7318     return (PyObject*) u;
7319 }
7320
7321 PyObject *PyUnicode_Replace(PyObject *obj,
7322                             PyObject *subobj,
7323                             PyObject *replobj,
7324                             Py_ssize_t maxcount)
7325 {
7326     PyObject *self;
7327     PyObject *str1;
7328     PyObject *str2;
7329     PyObject *result;
7330
7331     self = PyUnicode_FromObject(obj);
7332     if (self == NULL)
7333         return NULL;
7334     str1 = PyUnicode_FromObject(subobj);
7335     if (str1 == NULL) {
7336         Py_DECREF(self);
7337         return NULL;
7338     }
7339     str2 = PyUnicode_FromObject(replobj);
7340     if (str2 == NULL) {
7341         Py_DECREF(self);
7342         Py_DECREF(str1);
7343         return NULL;
7344     }
7345     result = replace((PyUnicodeObject *)self,
7346                      (PyUnicodeObject *)str1,
7347                      (PyUnicodeObject *)str2,
7348                      maxcount);
7349     Py_DECREF(self);
7350     Py_DECREF(str1);
7351     Py_DECREF(str2);
7352     return result;
7353 }
7354
7355 PyDoc_STRVAR(replace__doc__,
7356 "S.replace (old, new[, count]) -> unicode\n\
7357 \n\
7358 Return a copy of S with all occurrences of substring\n\
7359 old replaced by new.  If the optional argument count is\n\
7360 given, only the first count occurrences are replaced.");
7361
7362 static PyObject*
7363 unicode_replace(PyUnicodeObject *self, PyObject *args)
7364 {
7365     PyUnicodeObject *str1;
7366     PyUnicodeObject *str2;
7367     Py_ssize_t maxcount = -1;
7368     PyObject *result;
7369
7370     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7371         return NULL;
7372     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7373     if (str1 == NULL)
7374         return NULL;
7375     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7376     if (str2 == NULL) {
7377         Py_DECREF(str1);
7378         return NULL;
7379     }
7380
7381     result = replace(self, str1, str2, maxcount);
7382
7383     Py_DECREF(str1);
7384     Py_DECREF(str2);
7385     return result;
7386 }
7387
7388 static
7389 PyObject *unicode_repr(PyObject *unicode)
7390 {
7391     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7392                                 PyUnicode_GET_SIZE(unicode),
7393                                 1);
7394 }
7395
7396 PyDoc_STRVAR(rfind__doc__,
7397 "S.rfind(sub [,start [,end]]) -> int\n\
7398 \n\
7399 Return the highest index in S where substring sub is found,\n\
7400 such that sub is contained within s[start:end].  Optional\n\
7401 arguments start and end are interpreted as in slice notation.\n\
7402 \n\
7403 Return -1 on failure.");
7404
7405 static PyObject *
7406 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7407 {
7408     PyObject *substring;
7409     Py_ssize_t start;
7410     Py_ssize_t end;
7411     Py_ssize_t result;
7412
7413     if (!_ParseTupleFinds(args, &substring, &start, &end))
7414             return NULL;
7415
7416     result = stringlib_rfind_slice(
7417         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7418         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7419         start, end
7420         );
7421
7422     Py_DECREF(substring);
7423
7424     return PyInt_FromSsize_t(result);
7425 }
7426
7427 PyDoc_STRVAR(rindex__doc__,
7428 "S.rindex(sub [,start [,end]]) -> int\n\
7429 \n\
7430 Like S.rfind() but raise ValueError when the substring is not found.");
7431
7432 static PyObject *
7433 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7434 {
7435     PyObject *substring;
7436     Py_ssize_t start;
7437     Py_ssize_t end;
7438     Py_ssize_t result;
7439
7440     if (!_ParseTupleFinds(args, &substring, &start, &end))
7441             return NULL;
7442
7443     result = stringlib_rfind_slice(
7444         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7445         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7446         start, end
7447         );
7448
7449     Py_DECREF(substring);
7450
7451     if (result < 0) {
7452         PyErr_SetString(PyExc_ValueError, "substring not found");
7453         return NULL;
7454     }
7455     return PyInt_FromSsize_t(result);
7456 }
7457
7458 PyDoc_STRVAR(rjust__doc__,
7459 "S.rjust(width[, fillchar]) -> unicode\n\
7460 \n\
7461 Return S right-justified in a Unicode string of length width. Padding is\n\
7462 done using the specified fill character (default is a space).");
7463
7464 static PyObject *
7465 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7466 {
7467     Py_ssize_t width;
7468     Py_UNICODE fillchar = ' ';
7469
7470     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7471         return NULL;
7472
7473     if (self->length >= width && PyUnicode_CheckExact(self)) {
7474         Py_INCREF(self);
7475         return (PyObject*) self;
7476     }
7477
7478     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7479 }
7480
7481 static PyObject*
7482 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7483 {
7484     /* standard clamping */
7485     if (start < 0)
7486         start = 0;
7487     if (end < 0)
7488         end = 0;
7489     if (end > self->length)
7490         end = self->length;
7491     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7492         /* full slice, return original string */
7493         Py_INCREF(self);
7494         return (PyObject*) self;
7495     }
7496     if (start > end)
7497         start = end;
7498     /* copy slice */
7499     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7500                                              end - start);
7501 }
7502
7503 PyObject *PyUnicode_Split(PyObject *s,
7504                           PyObject *sep,
7505                           Py_ssize_t maxsplit)
7506 {
7507     PyObject *result;
7508
7509     s = PyUnicode_FromObject(s);
7510     if (s == NULL)
7511         return NULL;
7512     if (sep != NULL) {
7513         sep = PyUnicode_FromObject(sep);
7514         if (sep == NULL) {
7515             Py_DECREF(s);
7516             return NULL;
7517         }
7518     }
7519
7520     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7521
7522     Py_DECREF(s);
7523     Py_XDECREF(sep);
7524     return result;
7525 }
7526
7527 PyDoc_STRVAR(split__doc__,
7528 "S.split([sep [,maxsplit]]) -> list of strings\n\
7529 \n\
7530 Return a list of the words in S, using sep as the\n\
7531 delimiter string.  If maxsplit is given, at most maxsplit\n\
7532 splits are done. If sep is not specified or is None, any\n\
7533 whitespace string is a separator and empty strings are\n\
7534 removed from the result.");
7535
7536 static PyObject*
7537 unicode_split(PyUnicodeObject *self, PyObject *args)
7538 {
7539     PyObject *substring = Py_None;
7540     Py_ssize_t maxcount = -1;
7541
7542     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7543         return NULL;
7544
7545     if (substring == Py_None)
7546         return split(self, NULL, maxcount);
7547     else if (PyUnicode_Check(substring))
7548         return split(self, (PyUnicodeObject *)substring, maxcount);
7549     else
7550         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7551 }
7552
7553 PyObject *
7554 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7555 {
7556     PyObject* str_obj;
7557     PyObject* sep_obj;
7558     PyObject* out;
7559
7560     str_obj = PyUnicode_FromObject(str_in);
7561     if (!str_obj)
7562         return NULL;
7563     sep_obj = PyUnicode_FromObject(sep_in);
7564     if (!sep_obj) {
7565         Py_DECREF(str_obj);
7566         return NULL;
7567     }
7568
7569     out = stringlib_partition(
7570         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7571         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7572         );
7573
7574     Py_DECREF(sep_obj);
7575     Py_DECREF(str_obj);
7576
7577     return out;
7578 }
7579
7580
7581 PyObject *
7582 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7583 {
7584     PyObject* str_obj;
7585     PyObject* sep_obj;
7586     PyObject* out;
7587
7588     str_obj = PyUnicode_FromObject(str_in);
7589     if (!str_obj)
7590         return NULL;
7591     sep_obj = PyUnicode_FromObject(sep_in);
7592     if (!sep_obj) {
7593         Py_DECREF(str_obj);
7594         return NULL;
7595     }
7596
7597     out = stringlib_rpartition(
7598         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7599         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7600         );
7601
7602     Py_DECREF(sep_obj);
7603     Py_DECREF(str_obj);
7604
7605     return out;
7606 }
7607
7608 PyDoc_STRVAR(partition__doc__,
7609 "S.partition(sep) -> (head, sep, tail)\n\
7610 \n\
7611 Search for the separator sep in S, and return the part before it,\n\
7612 the separator itself, and the part after it.  If the separator is not\n\
7613 found, return S and two empty strings.");
7614
7615 static PyObject*
7616 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7617 {
7618     return PyUnicode_Partition((PyObject *)self, separator);
7619 }
7620
7621 PyDoc_STRVAR(rpartition__doc__,
7622 "S.rpartition(sep) -> (tail, sep, head)\n\
7623 \n\
7624 Search for the separator sep in S, starting at the end of S, and return\n\
7625 the part before it, the separator itself, and the part after it.  If the\n\
7626 separator is not found, return two empty strings and S.");
7627
7628 static PyObject*
7629 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7630 {
7631     return PyUnicode_RPartition((PyObject *)self, separator);
7632 }
7633
7634 PyObject *PyUnicode_RSplit(PyObject *s,
7635                            PyObject *sep,
7636                            Py_ssize_t maxsplit)
7637 {
7638     PyObject *result;
7639
7640     s = PyUnicode_FromObject(s);
7641     if (s == NULL)
7642         return NULL;
7643     if (sep != NULL) {
7644         sep = PyUnicode_FromObject(sep);
7645         if (sep == NULL) {
7646             Py_DECREF(s);
7647             return NULL;
7648         }
7649     }
7650
7651     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7652
7653     Py_DECREF(s);
7654     Py_XDECREF(sep);
7655     return result;
7656 }
7657
7658 PyDoc_STRVAR(rsplit__doc__,
7659 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7660 \n\
7661 Return a list of the words in S, using sep as the\n\
7662 delimiter string, starting at the end of the string and\n\
7663 working to the front.  If maxsplit is given, at most maxsplit\n\
7664 splits are done. If sep is not specified, any whitespace string\n\
7665 is a separator.");
7666
7667 static PyObject*
7668 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7669 {
7670     PyObject *substring = Py_None;
7671     Py_ssize_t maxcount = -1;
7672
7673     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7674         return NULL;
7675
7676     if (substring == Py_None)
7677         return rsplit(self, NULL, maxcount);
7678     else if (PyUnicode_Check(substring))
7679         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7680     else
7681         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7682 }
7683
7684 PyDoc_STRVAR(splitlines__doc__,
7685 "S.splitlines([keepends]) -> list of strings\n\
7686 \n\
7687 Return a list of the lines in S, breaking at line boundaries.\n\
7688 Line breaks are not included in the resulting list unless keepends\n\
7689 is given and true.");
7690
7691 static PyObject*
7692 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7693 {
7694     int keepends = 0;
7695
7696     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7697         return NULL;
7698
7699     return PyUnicode_Splitlines((PyObject *)self, keepends);
7700 }
7701
7702 static
7703 PyObject *unicode_str(PyUnicodeObject *self)
7704 {
7705     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7706 }
7707
7708 PyDoc_STRVAR(swapcase__doc__,
7709 "S.swapcase() -> unicode\n\
7710 \n\
7711 Return a copy of S with uppercase characters converted to lowercase\n\
7712 and vice versa.");
7713
7714 static PyObject*
7715 unicode_swapcase(PyUnicodeObject *self)
7716 {
7717     return fixup(self, fixswapcase);
7718 }
7719
7720 PyDoc_STRVAR(translate__doc__,
7721 "S.translate(table) -> unicode\n\
7722 \n\
7723 Return a copy of the string S, where all characters have been mapped\n\
7724 through the given translation table, which must be a mapping of\n\
7725 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7726 Unmapped characters are left untouched. Characters mapped to None\n\
7727 are deleted.");
7728
7729 static PyObject*
7730 unicode_translate(PyUnicodeObject *self, PyObject *table)
7731 {
7732     return PyUnicode_TranslateCharmap(self->str,
7733                                       self->length,
7734                                       table,
7735                                       "ignore");
7736 }
7737
7738 PyDoc_STRVAR(upper__doc__,
7739 "S.upper() -> unicode\n\
7740 \n\
7741 Return a copy of S converted to uppercase.");
7742
7743 static PyObject*
7744 unicode_upper(PyUnicodeObject *self)
7745 {
7746     return fixup(self, fixupper);
7747 }
7748
7749 PyDoc_STRVAR(zfill__doc__,
7750 "S.zfill(width) -> unicode\n\
7751 \n\
7752 Pad a numeric string S with zeros on the left, to fill a field\n\
7753 of the specified width. The string S is never truncated.");
7754
7755 static PyObject *
7756 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7757 {
7758     Py_ssize_t fill;
7759     PyUnicodeObject *u;
7760
7761     Py_ssize_t width;
7762     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7763         return NULL;
7764
7765     if (self->length >= width) {
7766         if (PyUnicode_CheckExact(self)) {
7767             Py_INCREF(self);
7768             return (PyObject*) self;
7769         }
7770         else
7771             return PyUnicode_FromUnicode(
7772                 PyUnicode_AS_UNICODE(self),
7773                 PyUnicode_GET_SIZE(self)
7774             );
7775     }
7776
7777     fill = width - self->length;
7778
7779     u = pad(self, fill, 0, '0');
7780
7781     if (u == NULL)
7782         return NULL;
7783
7784     if (u->str[fill] == '+' || u->str[fill] == '-') {
7785         /* move sign to beginning of string */
7786         u->str[0] = u->str[fill];
7787         u->str[fill] = '0';
7788     }
7789
7790     return (PyObject*) u;
7791 }
7792
/* Compiled out by the "#if 0" guard.  When enabled, returns the current
   value of numfree as a Python int.  The matching "freelistsize" entry in
   the method table is disabled in the same way. */
#if 0
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7800
7801 PyDoc_STRVAR(startswith__doc__,
7802 "S.startswith(prefix[, start[, end]]) -> bool\n\
7803 \n\
7804 Return True if S starts with the specified prefix, False otherwise.\n\
7805 With optional start, test S beginning at that position.\n\
7806 With optional end, stop comparing S at that position.\n\
7807 prefix can also be a tuple of strings to try.");
7808
7809 static PyObject *
7810 unicode_startswith(PyUnicodeObject *self,
7811                    PyObject *args)
7812 {
7813     PyObject *subobj;
7814     PyUnicodeObject *substring;
7815     Py_ssize_t start = 0;
7816     Py_ssize_t end = PY_SSIZE_T_MAX;
7817     int result;
7818
7819     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7820                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7821         return NULL;
7822     if (PyTuple_Check(subobj)) {
7823         Py_ssize_t i;
7824         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7826                             PyTuple_GET_ITEM(subobj, i));
7827             if (substring == NULL)
7828                 return NULL;
7829             result = tailmatch(self, substring, start, end, -1);
7830             Py_DECREF(substring);
7831             if (result) {
7832                 Py_RETURN_TRUE;
7833             }
7834         }
7835         /* nothing matched */
7836         Py_RETURN_FALSE;
7837     }
7838     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7839     if (substring == NULL)
7840          return NULL;
7841     result = tailmatch(self, substring, start, end, -1);
7842     Py_DECREF(substring);
7843     return PyBool_FromLong(result);
7844 }
7845
7846
7847 PyDoc_STRVAR(endswith__doc__,
7848 "S.endswith(suffix[, start[, end]]) -> bool\n\
7849 \n\
7850 Return True if S ends with the specified suffix, False otherwise.\n\
7851 With optional start, test S beginning at that position.\n\
7852 With optional end, stop comparing S at that position.\n\
7853 suffix can also be a tuple of strings to try.");
7854
7855 static PyObject *
7856 unicode_endswith(PyUnicodeObject *self,
7857                  PyObject *args)
7858 {
7859     PyObject *subobj;
7860     PyUnicodeObject *substring;
7861     Py_ssize_t start = 0;
7862     Py_ssize_t end = PY_SSIZE_T_MAX;
7863     int result;
7864
7865     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7866         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7867         return NULL;
7868     if (PyTuple_Check(subobj)) {
7869         Py_ssize_t i;
7870         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7871             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7872                             PyTuple_GET_ITEM(subobj, i));
7873             if (substring == NULL)
7874             return NULL;
7875             result = tailmatch(self, substring, start, end, +1);
7876             Py_DECREF(substring);
7877             if (result) {
7878                 Py_RETURN_TRUE;
7879             }
7880         }
7881         Py_RETURN_FALSE;
7882     }
7883     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7884     if (substring == NULL)
7885     return NULL;
7886
7887     result = tailmatch(self, substring, start, end, +1);
7888     Py_DECREF(substring);
7889     return PyBool_FromLong(result);
7890 }
7891
7892
7893 /* Implements do_string_format, which is unicode because of stringlib */
7894 #include "stringlib/string_format.h"
7895
7896 PyDoc_STRVAR(format__doc__,
7897 "S.format(*args, **kwargs) -> unicode\n\
7898 \n\
7899 ");
7900
7901 static PyObject *
7902 unicode__format__(PyObject *self, PyObject *args)
7903 {
7904     PyObject *format_spec;
7905     PyObject *result = NULL;
7906     PyObject *tmp = NULL;
7907
7908     /* If 2.x, convert format_spec to the same type as value */
7909     /* This is to allow things like u''.format('') */
7910     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7911         goto done;
7912     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7913         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7914                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7915         goto done;
7916     }
7917     tmp = PyObject_Unicode(format_spec);
7918     if (tmp == NULL)
7919         goto done;
7920     format_spec = tmp;
7921
7922     result = _PyUnicode_FormatAdvanced(self,
7923                                        PyUnicode_AS_UNICODE(format_spec),
7924                                        PyUnicode_GET_SIZE(format_spec));
7925 done:
7926     Py_XDECREF(tmp);
7927     return result;
7928 }
7929
7930 PyDoc_STRVAR(p_format__doc__,
7931 "S.__format__(format_spec) -> unicode\n\
7932 \n\
7933 ");
7934
7935 static PyObject *
7936 unicode__sizeof__(PyUnicodeObject *v)
7937 {
7938     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7939                              sizeof(Py_UNICODE) * (v->length + 1));
7940 }
7941
7942 PyDoc_STRVAR(sizeof__doc__,
7943 "S.__sizeof__() -> size of S in memory, in bytes\n\
7944 \n\
7945 ");
7946
7947 static PyObject *
7948 unicode_getnewargs(PyUnicodeObject *v)
7949 {
7950         return Py_BuildValue("(u#)", v->str, v->length);
7951 }
7952
7953
/* Method table for the unicode type.

   Each entry gives the Python-visible method name, the C function that
   implements it, its calling-convention flags (METH_NOARGS, METH_O,
   METH_VARARGS, optionally combined with METH_KEYWORDS) and, where one
   exists, its docstring.  The table ends with a {NULL, NULL} sentinel.
   Entries inside "#if 0" and the commented-out maketrans line are not
   compiled. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8016
8017 static PyObject *
8018 unicode_mod(PyObject *v, PyObject *w)
8019 {
8020        if (!PyUnicode_Check(v)) {
8021                Py_INCREF(Py_NotImplemented);
8022                return Py_NotImplemented;
8023        }
8024        return PyUnicode_Format(v, w);
8025 }
8026
/* Number-protocol slots for unicode.  Only nb_remainder is filled in, which
   routes the % operator to unicode_mod(); the slots after nb_remainder are
   not listed and are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
        0,                              /*nb_add*/
        0,                              /*nb_subtract*/
        0,                              /*nb_multiply*/
        0,                              /*nb_divide*/
        unicode_mod,                    /*nb_remainder*/
};
8034
/* Sequence-protocol slots for unicode: length, concatenation, repetition,
   item access, simple slicing and containment.  The two assignment slots
   are 0, so no item or slice assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
8045
8046 static PyObject*
8047 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8048 {
8049     if (PyIndex_Check(item)) {
8050         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8051         if (i == -1 && PyErr_Occurred())
8052             return NULL;
8053         if (i < 0)
8054             i += PyUnicode_GET_SIZE(self);
8055         return unicode_getitem(self, i);
8056     } else if (PySlice_Check(item)) {
8057         Py_ssize_t start, stop, step, slicelength, cur, i;
8058         Py_UNICODE* source_buf;
8059         Py_UNICODE* result_buf;
8060         PyObject* result;
8061
8062         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8063                                  &start, &stop, &step, &slicelength) < 0) {
8064             return NULL;
8065         }
8066
8067         if (slicelength <= 0) {
8068             return PyUnicode_FromUnicode(NULL, 0);
8069         } else if (start == 0 && step == 1 && slicelength == self->length &&
8070                    PyUnicode_CheckExact(self)) {
8071             Py_INCREF(self);
8072             return (PyObject *)self;
8073         } else if (step == 1) {
8074             return PyUnicode_FromUnicode(self->str + start, slicelength);
8075         } else {
8076             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8077             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8078                                                        sizeof(Py_UNICODE));
8079
8080             if (result_buf == NULL)
8081                     return PyErr_NoMemory();
8082
8083             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8084                 result_buf[i] = source_buf[cur];
8085             }
8086
8087             result = PyUnicode_FromUnicode(result_buf, slicelength);
8088             PyObject_FREE(result_buf);
8089             return result;
8090         }
8091     } else {
8092         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8093         return NULL;
8094     }
8095 }
8096
/* Mapping-protocol slots for unicode: length and subscripting (integer
   indices and slice objects, see unicode_subscript()).  The assignment slot
   is 0, so no subscript-assignment handler is provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
8102
8103 static Py_ssize_t
8104 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8105                           Py_ssize_t index,
8106                           const void **ptr)
8107 {
8108     if (index != 0) {
8109         PyErr_SetString(PyExc_SystemError,
8110                         "accessing non-existent unicode segment");
8111         return -1;
8112     }
8113     *ptr = (void *) self->str;
8114     return PyUnicode_GET_DATA_SIZE(self);
8115 }
8116
8117 static Py_ssize_t
8118 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8119                            const void **ptr)
8120 {
8121     PyErr_SetString(PyExc_TypeError,
8122                     "cannot use unicode as modifiable buffer");
8123     return -1;
8124 }
8125
8126 static int
8127 unicode_buffer_getsegcount(PyUnicodeObject *self,
8128                            Py_ssize_t *lenp)
8129 {
8130     if (lenp)
8131         *lenp = PyUnicode_GET_DATA_SIZE(self);
8132     return 1;
8133 }
8134
8135 static Py_ssize_t
8136 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8137                           Py_ssize_t index,
8138                           const void **ptr)
8139 {
8140     PyObject *str;
8141
8142     if (index != 0) {
8143         PyErr_SetString(PyExc_SystemError,
8144                         "accessing non-existent unicode segment");
8145         return -1;
8146     }
8147     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8148     if (str == NULL)
8149         return -1;
8150     *ptr = (void *) PyString_AS_STRING(str);
8151     return PyString_GET_SIZE(str);
8152 }
8153
8154 /* Helpers for PyUnicode_Format() */
8155
8156 static PyObject *
8157 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8158 {
8159     Py_ssize_t argidx = *p_argidx;
8160     if (argidx < arglen) {
8161         (*p_argidx)++;
8162         if (arglen < 0)
8163             return args;
8164         else
8165             return PyTuple_GetItem(args, argidx);
8166     }
8167     PyErr_SetString(PyExc_TypeError,
8168                     "not enough arguments for format string");
8169     return NULL;
8170 }
8171
/* Conversion-flag bits; one is set for each flag character seen while
   parsing a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8177
8178 static Py_ssize_t
8179 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8180 {
8181     register Py_ssize_t i;
8182     Py_ssize_t len = strlen(charbuffer);
8183     for (i = len - 1; i >= 0; i--)
8184         buffer[i] = (Py_UNICODE) charbuffer[i];
8185
8186     return len;
8187 }
8188
8189 static int
8190 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8191 {
8192     Py_ssize_t result;
8193
8194     PyOS_ascii_formatd((char *)buffer, len, format, x);
8195     result = strtounicode(buffer, (char *)buffer);
8196     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8197 }
8198
8199 static int
8200 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8201 {
8202     Py_ssize_t result;
8203
8204     PyOS_snprintf((char *)buffer, len, format, x);
8205     result = strtounicode(buffer, (char *)buffer);
8206     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8207 }
8208
8209 /* XXX To save some code duplication, formatfloat/long/int could have been
8210    shared with stringobject.c, converting from 8-bit to Unicode after the
8211    formatting is done. */
8212
8213 static int
8214 formatfloat(Py_UNICODE *buf,
8215             size_t buflen,
8216             int flags,
8217             int prec,
8218             int type,
8219             PyObject *v)
8220 {
8221     /* fmt = '%#.' + `prec` + `type`
8222        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8223     char fmt[20];
8224     double x;
8225
8226     x = PyFloat_AsDouble(v);
8227     if (x == -1.0 && PyErr_Occurred())
8228         return -1;
8229     if (prec < 0)
8230         prec = 6;
8231     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8232         type = 'g';
8233     /* Worst case length calc to ensure no buffer overrun:
8234
8235        'g' formats:
8236          fmt = %#.<prec>g
8237          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8238             for any double rep.)
8239          len = 1 + prec + 1 + 2 + 5 = 9 + prec
8240
8241        'f' formats:
8242          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8243          len = 1 + 50 + 1 + prec = 52 + prec
8244
8245        If prec=0 the effective precision is 1 (the leading digit is
8246        always given), therefore increase the length by one.
8247
8248     */
8249     if (((type == 'g' || type == 'G') &&
8250           buflen <= (size_t)10 + (size_t)prec) ||
8251         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8252         PyErr_SetString(PyExc_OverflowError,
8253                         "formatted float is too long (precision too large?)");
8254         return -1;
8255     }
8256     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8257                   (flags&F_ALT) ? "#" : "",
8258                   prec, type);
8259     return doubletounicode(buf, buflen, fmt, x);
8260 }
8261
8262 static PyObject*
8263 formatlong(PyObject *val, int flags, int prec, int type)
8264 {
8265         char *buf;
8266         int i, len;
8267         PyObject *str; /* temporary string object. */
8268         PyUnicodeObject *result;
8269
8270         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8271         if (!str)
8272                 return NULL;
8273         result = _PyUnicode_New(len);
8274         if (!result) {
8275                 Py_DECREF(str);
8276                 return NULL;
8277         }
8278         for (i = 0; i < len; i++)
8279                 result->str[i] = buf[i];
8280         result->str[len] = 0;
8281         Py_DECREF(str);
8282         return (PyObject*)result;
8283 }
8284
8285 static int
8286 formatint(Py_UNICODE *buf,
8287           size_t buflen,
8288           int flags,
8289           int prec,
8290           int type,
8291           PyObject *v)
8292 {
8293     /* fmt = '%#.' + `prec` + 'l' + `type`
8294      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8295      *                     + 1 + 1
8296      *                   = 24
8297      */
8298     char fmt[64]; /* plenty big enough! */
8299     char *sign;
8300     long x;
8301
8302     x = PyInt_AsLong(v);
8303     if (x == -1 && PyErr_Occurred())
8304         return -1;
8305     if (x < 0 && type == 'u') {
8306         type = 'd';
8307     }
8308     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8309         sign = "-";
8310     else
8311         sign = "";
8312     if (prec < 0)
8313         prec = 1;
8314
8315     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8316      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8317      */
8318     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8319         PyErr_SetString(PyExc_OverflowError,
8320                 "formatted integer is too long (precision too large?)");
8321         return -1;
8322     }
8323
8324     if ((flags & F_ALT) &&
8325         (type == 'x' || type == 'X')) {
8326         /* When converting under %#x or %#X, there are a number
8327          * of issues that cause pain:
8328          * - when 0 is being converted, the C standard leaves off
8329          *   the '0x' or '0X', which is inconsistent with other
8330          *   %#x/%#X conversions and inconsistent with Python's
8331          *   hex() function
8332          * - there are platforms that violate the standard and
8333          *   convert 0 with the '0x' or '0X'
8334          *   (Metrowerks, Compaq Tru64)
8335          * - there are platforms that give '0x' when converting
8336          *   under %#X, but convert 0 in accordance with the
8337          *   standard (OS/2 EMX)
8338          *
8339          * We can achieve the desired consistency by inserting our
8340          * own '0x' or '0X' prefix, and substituting %x/%X in place
8341          * of %#x/%#X.
8342          *
8343          * Note that this is the same approach as used in
8344          * formatint() in stringobject.c
8345          */
8346         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8347                       sign, type, prec, type);
8348     }
8349     else {
8350         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8351                       sign, (flags&F_ALT) ? "#" : "",
8352                       prec, type);
8353     }
8354     if (sign[0])
8355         return longtounicode(buf, buflen, fmt, -x);
8356     else
8357         return longtounicode(buf, buflen, fmt, x);
8358 }
8359
8360 static int
8361 formatchar(Py_UNICODE *buf,
8362            size_t buflen,
8363            PyObject *v)
8364 {
8365     /* presume that the buffer is at least 2 characters long */
8366     if (PyUnicode_Check(v)) {
8367         if (PyUnicode_GET_SIZE(v) != 1)
8368             goto onError;
8369         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8370     }
8371
8372     else if (PyString_Check(v)) {
8373         if (PyString_GET_SIZE(v) != 1)
8374             goto onError;
8375         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8376     }
8377
8378     else {
8379         /* Integer input truncated to a character */
8380         long x;
8381         x = PyInt_AsLong(v);
8382         if (x == -1 && PyErr_Occurred())
8383             goto onError;
8384 #ifdef Py_UNICODE_WIDE
8385         if (x < 0 || x > 0x10ffff) {
8386             PyErr_SetString(PyExc_OverflowError,
8387                             "%c arg not in range(0x110000) "
8388                             "(wide Python build)");
8389             return -1;
8390         }
8391 #else
8392         if (x < 0 || x > 0xffff) {
8393             PyErr_SetString(PyExc_OverflowError,
8394                             "%c arg not in range(0x10000) "
8395                             "(narrow Python build)");
8396             return -1;
8397         }
8398 #endif
8399         buf[0] = (Py_UNICODE) x;
8400     }
8401     buf[1] = '\0';
8402     return 1;
8403
8404  onError:
8405     PyErr_SetString(PyExc_TypeError,
8406                     "%c requires int or char");
8407     return -1;
8408 }
8409
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8419
8420 PyObject *PyUnicode_Format(PyObject *format,
8421                            PyObject *args)
8422 {
8423     Py_UNICODE *fmt, *res;
8424     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8425     int args_owned = 0;
8426     PyUnicodeObject *result = NULL;
8427     PyObject *dict = NULL;
8428     PyObject *uformat;
8429
8430     if (format == NULL || args == NULL) {
8431         PyErr_BadInternalCall();
8432         return NULL;
8433     }
8434     uformat = PyUnicode_FromObject(format);
8435     if (uformat == NULL)
8436         return NULL;
8437     fmt = PyUnicode_AS_UNICODE(uformat);
8438     fmtcnt = PyUnicode_GET_SIZE(uformat);
8439
8440     reslen = rescnt = fmtcnt + 100;
8441     result = _PyUnicode_New(reslen);
8442     if (result == NULL)
8443         goto onError;
8444     res = PyUnicode_AS_UNICODE(result);
8445
8446     if (PyTuple_Check(args)) {
8447         arglen = PyTuple_Size(args);
8448         argidx = 0;
8449     }
8450     else {
8451         arglen = -1;
8452         argidx = -2;
8453     }
8454     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8455         !PyObject_TypeCheck(args, &PyBaseString_Type))
8456         dict = args;
8457
8458     while (--fmtcnt >= 0) {
8459         if (*fmt != '%') {
8460             if (--rescnt < 0) {
8461                 rescnt = fmtcnt + 100;
8462                 reslen += rescnt;
8463                 if (_PyUnicode_Resize(&result, reslen) < 0)
8464                     goto onError;
8465                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8466                 --rescnt;
8467             }
8468             *res++ = *fmt++;
8469         }
8470         else {
8471             /* Got a format specifier */
8472             int flags = 0;
8473             Py_ssize_t width = -1;
8474             int prec = -1;
8475             Py_UNICODE c = '\0';
8476             Py_UNICODE fill;
8477             int isnumok;
8478             PyObject *v = NULL;
8479             PyObject *temp = NULL;
8480             Py_UNICODE *pbuf;
8481             Py_UNICODE sign;
8482             Py_ssize_t len;
8483             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8484
8485             fmt++;
8486             if (*fmt == '(') {
8487                 Py_UNICODE *keystart;
8488                 Py_ssize_t keylen;
8489                 PyObject *key;
8490                 int pcount = 1;
8491
8492                 if (dict == NULL) {
8493                     PyErr_SetString(PyExc_TypeError,
8494                                     "format requires a mapping");
8495                     goto onError;
8496                 }
8497                 ++fmt;
8498                 --fmtcnt;
8499                 keystart = fmt;
8500                 /* Skip over balanced parentheses */
8501                 while (pcount > 0 && --fmtcnt >= 0) {
8502                     if (*fmt == ')')
8503                         --pcount;
8504                     else if (*fmt == '(')
8505                         ++pcount;
8506                     fmt++;
8507                 }
8508                 keylen = fmt - keystart - 1;
8509                 if (fmtcnt < 0 || pcount > 0) {
8510                     PyErr_SetString(PyExc_ValueError,
8511                                     "incomplete format key");
8512                     goto onError;
8513                 }
8514 #if 0
8515                 /* keys are converted to strings using UTF-8 and
8516                    then looked up since Python uses strings to hold
8517                    variables names etc. in its namespaces and we
8518                    wouldn't want to break common idioms. */
8519                 key = PyUnicode_EncodeUTF8(keystart,
8520                                            keylen,
8521                                            NULL);
8522 #else
8523                 key = PyUnicode_FromUnicode(keystart, keylen);
8524 #endif
8525                 if (key == NULL)
8526                     goto onError;
8527                 if (args_owned) {
8528                     Py_DECREF(args);
8529                     args_owned = 0;
8530                 }
8531                 args = PyObject_GetItem(dict, key);
8532                 Py_DECREF(key);
8533                 if (args == NULL) {
8534                     goto onError;
8535                 }
8536                 args_owned = 1;
8537                 arglen = -1;
8538                 argidx = -2;
8539             }
8540             while (--fmtcnt >= 0) {
8541                 switch (c = *fmt++) {
8542                 case '-': flags |= F_LJUST; continue;
8543                 case '+': flags |= F_SIGN; continue;
8544                 case ' ': flags |= F_BLANK; continue;
8545                 case '#': flags |= F_ALT; continue;
8546                 case '0': flags |= F_ZERO; continue;
8547                 }
8548                 break;
8549             }
8550             if (c == '*') {
8551                 v = getnextarg(args, arglen, &argidx);
8552                 if (v == NULL)
8553                     goto onError;
8554                 if (!PyInt_Check(v)) {
8555                     PyErr_SetString(PyExc_TypeError,
8556                                     "* wants int");
8557                     goto onError;
8558                 }
8559                 width = PyInt_AsLong(v);
8560                 if (width < 0) {
8561                     flags |= F_LJUST;
8562                     width = -width;
8563                 }
8564                 if (--fmtcnt >= 0)
8565                     c = *fmt++;
8566             }
8567             else if (c >= '0' && c <= '9') {
8568                 width = c - '0';
8569                 while (--fmtcnt >= 0) {
8570                     c = *fmt++;
8571                     if (c < '0' || c > '9')
8572                         break;
8573                     if ((width*10) / 10 != width) {
8574                         PyErr_SetString(PyExc_ValueError,
8575                                         "width too big");
8576                         goto onError;
8577                     }
8578                     width = width*10 + (c - '0');
8579                 }
8580             }
8581             if (c == '.') {
8582                 prec = 0;
8583                 if (--fmtcnt >= 0)
8584                     c = *fmt++;
8585                 if (c == '*') {
8586                     v = getnextarg(args, arglen, &argidx);
8587                     if (v == NULL)
8588                         goto onError;
8589                     if (!PyInt_Check(v)) {
8590                         PyErr_SetString(PyExc_TypeError,
8591                                         "* wants int");
8592                         goto onError;
8593                     }
8594                     prec = PyInt_AsLong(v);
8595                     if (prec < 0)
8596                         prec = 0;
8597                     if (--fmtcnt >= 0)
8598                         c = *fmt++;
8599                 }
8600                 else if (c >= '0' && c <= '9') {
8601                     prec = c - '0';
8602                     while (--fmtcnt >= 0) {
8603                         c = Py_CHARMASK(*fmt++);
8604                         if (c < '0' || c > '9')
8605                             break;
8606                         if ((prec*10) / 10 != prec) {
8607                             PyErr_SetString(PyExc_ValueError,
8608                                             "prec too big");
8609                             goto onError;
8610                         }
8611                         prec = prec*10 + (c - '0');
8612                     }
8613                 }
8614             } /* prec */
8615             if (fmtcnt >= 0) {
8616                 if (c == 'h' || c == 'l' || c == 'L') {
8617                     if (--fmtcnt >= 0)
8618                         c = *fmt++;
8619                 }
8620             }
8621             if (fmtcnt < 0) {
8622                 PyErr_SetString(PyExc_ValueError,
8623                                 "incomplete format");
8624                 goto onError;
8625             }
8626             if (c != '%') {
8627                 v = getnextarg(args, arglen, &argidx);
8628                 if (v == NULL)
8629                     goto onError;
8630             }
8631             sign = 0;
8632             fill = ' ';
8633             switch (c) {
8634
8635             case '%':
8636                 pbuf = formatbuf;
8637                 /* presume that buffer length is at least 1 */
8638                 pbuf[0] = '%';
8639                 len = 1;
8640                 break;
8641
8642             case 's':
8643             case 'r':
8644                 if (PyUnicode_Check(v) && c == 's') {
8645                     temp = v;
8646                     Py_INCREF(temp);
8647                 }
8648                 else {
8649                     PyObject *unicode;
8650                     if (c == 's')
8651                         temp = PyObject_Unicode(v);
8652                     else
8653                         temp = PyObject_Repr(v);
8654                     if (temp == NULL)
8655                         goto onError;
8656                     if (PyUnicode_Check(temp))
8657                         /* nothing to do */;
8658                     else if (PyString_Check(temp)) {
8659                         /* convert to string to Unicode */
8660                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8661                                                    PyString_GET_SIZE(temp),
8662                                                    NULL,
8663                                                    "strict");
8664                         Py_DECREF(temp);
8665                         temp = unicode;
8666                         if (temp == NULL)
8667                             goto onError;
8668                     }
8669                     else {
8670                         Py_DECREF(temp);
8671                         PyErr_SetString(PyExc_TypeError,
8672                                         "%s argument has non-string str()");
8673                         goto onError;
8674                     }
8675                 }
8676                 pbuf = PyUnicode_AS_UNICODE(temp);
8677                 len = PyUnicode_GET_SIZE(temp);
8678                 if (prec >= 0 && len > prec)
8679                     len = prec;
8680                 break;
8681
8682             case 'i':
8683             case 'd':
8684             case 'u':
8685             case 'o':
8686             case 'x':
8687             case 'X':
8688                 if (c == 'i')
8689                     c = 'd';
8690                 isnumok = 0;
8691                 if (PyNumber_Check(v)) {
8692                         PyObject *iobj=NULL;
8693
8694                         if (PyInt_Check(v) || (PyLong_Check(v))) {
8695                                 iobj = v;
8696                                 Py_INCREF(iobj);
8697                         }
8698                         else {
8699                                 iobj = PyNumber_Int(v);
8700                                 if (iobj==NULL) iobj = PyNumber_Long(v);
8701                         }
8702                         if (iobj!=NULL) {
8703                                 if (PyInt_Check(iobj)) {
8704                                         isnumok = 1;
8705                                         pbuf = formatbuf;
8706                                         len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8707                                                     flags, prec, c, iobj);
8708                                         Py_DECREF(iobj);
8709                                         if (len < 0)
8710                                             goto onError;
8711                                         sign = 1;
8712                                 }
8713                                 else if (PyLong_Check(iobj)) {
8714                                         isnumok = 1;
8715                                         temp = formatlong(iobj, flags, prec, c);
8716                                         Py_DECREF(iobj);
8717                                         if (!temp)
8718                                             goto onError;
8719                                         pbuf = PyUnicode_AS_UNICODE(temp);
8720                                         len = PyUnicode_GET_SIZE(temp);
8721                                         sign = 1;
8722                                 }
8723                                 else {
8724                                         Py_DECREF(iobj);
8725                                 }
8726                         }
8727                 }
8728                 if (!isnumok) {
8729                         PyErr_Format(PyExc_TypeError,
8730                             "%%%c format: a number is required, "
8731                                      "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8732                         goto onError;
8733                 }
8734                 if (flags & F_ZERO)
8735                     fill = '0';
8736                 break;
8737
8738             case 'e':
8739             case 'E':
8740             case 'f':
8741             case 'F':
8742             case 'g':
8743             case 'G':
8744                 if (c == 'F')
8745                         c = 'f';
8746                 pbuf = formatbuf;
8747                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8748                         flags, prec, c, v);
8749                 if (len < 0)
8750                     goto onError;
8751                 sign = 1;
8752                 if (flags & F_ZERO)
8753                     fill = '0';
8754                 break;
8755
8756             case 'c':
8757                 pbuf = formatbuf;
8758                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8759                 if (len < 0)
8760                     goto onError;
8761                 break;
8762
8763             default:
8764                 PyErr_Format(PyExc_ValueError,
8765                              "unsupported format character '%c' (0x%x) "
8766                              "at index %zd",
8767                              (31<=c && c<=126) ? (char)c : '?',
8768                              (int)c,
8769                              (Py_ssize_t)(fmt - 1 -
8770                                           PyUnicode_AS_UNICODE(uformat)));
8771                 goto onError;
8772             }
8773             if (sign) {
8774                 if (*pbuf == '-' || *pbuf == '+') {
8775                     sign = *pbuf++;
8776                     len--;
8777                 }
8778                 else if (flags & F_SIGN)
8779                     sign = '+';
8780                 else if (flags & F_BLANK)
8781                     sign = ' ';
8782                 else
8783                     sign = 0;
8784             }
8785             if (width < len)
8786                 width = len;
8787             if (rescnt - (sign != 0) < width) {
8788                 reslen -= rescnt;
8789                 rescnt = width + fmtcnt + 100;
8790                 reslen += rescnt;
8791                 if (reslen < 0) {
8792                     Py_XDECREF(temp);
8793                     PyErr_NoMemory();
8794                     goto onError;
8795                 }
8796                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8797                     Py_XDECREF(temp);
8798                     goto onError;
8799                 }
8800                 res = PyUnicode_AS_UNICODE(result)
8801                     + reslen - rescnt;
8802             }
8803             if (sign) {
8804                 if (fill != ' ')
8805                     *res++ = sign;
8806                 rescnt--;
8807                 if (width > len)
8808                     width--;
8809             }
8810             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8811                 assert(pbuf[0] == '0');
8812                 assert(pbuf[1] == c);
8813                 if (fill != ' ') {
8814                     *res++ = *pbuf++;
8815                     *res++ = *pbuf++;
8816                 }
8817                 rescnt -= 2;
8818                 width -= 2;
8819                 if (width < 0)
8820                     width = 0;
8821                 len -= 2;
8822             }
8823             if (width > len && !(flags & F_LJUST)) {
8824                 do {
8825                     --rescnt;
8826                     *res++ = fill;
8827                 } while (--width > len);
8828             }
8829             if (fill == ' ') {
8830                 if (sign)
8831                     *res++ = sign;
8832                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8833                     assert(pbuf[0] == '0');
8834                     assert(pbuf[1] == c);
8835                     *res++ = *pbuf++;
8836                     *res++ = *pbuf++;
8837                 }
8838             }
8839             Py_UNICODE_COPY(res, pbuf, len);
8840             res += len;
8841             rescnt -= len;
8842             while (--width >= len) {
8843                 --rescnt;
8844                 *res++ = ' ';
8845             }
8846             if (dict && (argidx < arglen) && c != '%') {
8847                 PyErr_SetString(PyExc_TypeError,
8848                                 "not all arguments converted during string formatting");
8849                 Py_XDECREF(temp);
8850                 goto onError;
8851             }
8852             Py_XDECREF(temp);
8853         } /* '%' */
8854     } /* until end */
8855     if (argidx < arglen && !dict) {
8856         PyErr_SetString(PyExc_TypeError,
8857                         "not all arguments converted during string formatting");
8858         goto onError;
8859     }
8860
8861     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8862         goto onError;
8863     if (args_owned) {
8864         Py_DECREF(args);
8865     }
8866     Py_DECREF(uformat);
8867     return (PyObject *)result;
8868
8869  onError:
8870     Py_XDECREF(result);
8871     Py_DECREF(uformat);
8872     if (args_owned) {
8873         Py_DECREF(args);
8874     }
8875     return NULL;
8876 }
8877
8878 static PyBufferProcs unicode_as_buffer = {
8879     (readbufferproc) unicode_buffer_getreadbuf,
8880     (writebufferproc) unicode_buffer_getwritebuf,
8881     (segcountproc) unicode_buffer_getsegcount,
8882     (charbufferproc) unicode_buffer_getcharbuf,
8883 };
8884
8885 static PyObject *
8886 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8887
8888 static PyObject *
8889 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8890 {
8891         PyObject *x = NULL;
8892         static char *kwlist[] = {"string", "encoding", "errors", 0};
8893         char *encoding = NULL;
8894         char *errors = NULL;
8895
8896         if (type != &PyUnicode_Type)
8897                 return unicode_subtype_new(type, args, kwds);
8898         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8899                                           kwlist, &x, &encoding, &errors))
8900             return NULL;
8901         if (x == NULL)
8902                 return (PyObject *)_PyUnicode_New(0);
8903         if (encoding == NULL && errors == NULL)
8904             return PyObject_Unicode(x);
8905         else
8906         return PyUnicode_FromEncodedObject(x, encoding, errors);
8907 }
8908
8909 static PyObject *
8910 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8911 {
8912         PyUnicodeObject *tmp, *pnew;
8913         Py_ssize_t n;
8914
8915         assert(PyType_IsSubtype(type, &PyUnicode_Type));
8916         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8917         if (tmp == NULL)
8918                 return NULL;
8919         assert(PyUnicode_Check(tmp));
8920         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8921         if (pnew == NULL) {
8922                 Py_DECREF(tmp);
8923                 return NULL;
8924         }
8925         pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8926         if (pnew->str == NULL) {
8927                 _Py_ForgetReference((PyObject *)pnew);
8928                 PyObject_Del(pnew);
8929                 Py_DECREF(tmp);
8930                 return PyErr_NoMemory();
8931         }
8932         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8933         pnew->length = n;
8934         pnew->hash = tmp->hash;
8935         Py_DECREF(tmp);
8936         return (PyObject *)pnew;
8937 }
8938
8939 PyDoc_STRVAR(unicode_doc,
8940 "unicode(string [, encoding[, errors]]) -> object\n\
8941 \n\
8942 Create a new Unicode object from the given encoded string.\n\
8943 encoding defaults to the current default string encoding.\n\
8944 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8945
8946 PyTypeObject PyUnicode_Type = {
8947     PyVarObject_HEAD_INIT(&PyType_Type, 0)
8948     "unicode",                          /* tp_name */
8949     sizeof(PyUnicodeObject),            /* tp_size */
8950     0,                                  /* tp_itemsize */
8951     /* Slots */
8952     (destructor)unicode_dealloc,        /* tp_dealloc */
8953     0,                                  /* tp_print */
8954     0,                                  /* tp_getattr */
8955     0,                                  /* tp_setattr */
8956     0,                                  /* tp_compare */
8957     unicode_repr,                       /* tp_repr */
8958     &unicode_as_number,                 /* tp_as_number */
8959     &unicode_as_sequence,               /* tp_as_sequence */
8960     &unicode_as_mapping,                /* tp_as_mapping */
8961     (hashfunc) unicode_hash,            /* tp_hash*/
8962     0,                                  /* tp_call*/
8963     (reprfunc) unicode_str,             /* tp_str */
8964     PyObject_GenericGetAttr,            /* tp_getattro */
8965     0,                                  /* tp_setattro */
8966     &unicode_as_buffer,                 /* tp_as_buffer */
8967     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8968             Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
8969     unicode_doc,                        /* tp_doc */
8970     0,                                  /* tp_traverse */
8971     0,                                  /* tp_clear */
8972     PyUnicode_RichCompare,              /* tp_richcompare */
8973     0,                                  /* tp_weaklistoffset */
8974     0,                                  /* tp_iter */
8975     0,                                  /* tp_iternext */
8976     unicode_methods,                    /* tp_methods */
8977     0,                                  /* tp_members */
8978     0,                                  /* tp_getset */
8979     &PyBaseString_Type,                 /* tp_base */
8980     0,                                  /* tp_dict */
8981     0,                                  /* tp_descr_get */
8982     0,                                  /* tp_descr_set */
8983     0,                                  /* tp_dictoffset */
8984     0,                                  /* tp_init */
8985     0,                                  /* tp_alloc */
8986     unicode_new,                        /* tp_new */
8987     PyObject_Del,               /* tp_free */
8988 };
8989
8990 /* Initialize the Unicode implementation */
8991
8992 void _PyUnicode_Init(void)
8993 {
8994     int i;
8995
8996     /* XXX - move this array to unicodectype.c ? */
8997     Py_UNICODE linebreak[] = {
8998         0x000A, /* LINE FEED */
8999         0x000D, /* CARRIAGE RETURN */
9000         0x001C, /* FILE SEPARATOR */
9001         0x001D, /* GROUP SEPARATOR */
9002         0x001E, /* RECORD SEPARATOR */
9003         0x0085, /* NEXT LINE */
9004         0x2028, /* LINE SEPARATOR */
9005         0x2029, /* PARAGRAPH SEPARATOR */
9006     };
9007
9008     /* Init the implementation */
9009     free_list = NULL;
9010     numfree = 0;
9011     unicode_empty = _PyUnicode_New(0);
9012     if (!unicode_empty)
9013         return;
9014
9015     strcpy(unicode_default_encoding, "ascii");
9016     for (i = 0; i < 256; i++)
9017         unicode_latin1[i] = NULL;
9018     if (PyType_Ready(&PyUnicode_Type) < 0)
9019         Py_FatalError("Can't initialize 'unicode'");
9020
9021     /* initialize the linebreak bloom filter */
9022     bloom_linebreak = make_bloom_mask(
9023         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9024         );
9025
9026     PyType_Ready(&EncodingMapType);
9027 }
9028
9029 /* Finalize the Unicode implementation */
9030
9031 int
9032 PyUnicode_ClearFreeList(void)
9033 {
9034     int freelist_size = numfree;
9035     PyUnicodeObject *u;
9036
9037     for (u = free_list; u != NULL;) {
9038         PyUnicodeObject *v = u;
9039         u = *(PyUnicodeObject **)u;
9040         if (v->str)
9041             PyObject_DEL(v->str);
9042         Py_XDECREF(v->defenc);
9043         PyObject_Del(v);
9044         numfree--;
9045     }
9046     free_list = NULL;
9047     assert(numfree == 0);
9048     return freelist_size;
9049 }
9050
9051 void
9052 _PyUnicode_Fini(void)
9053 {
9054     int i;
9055
9056     Py_XDECREF(unicode_empty);
9057     unicode_empty = NULL;
9058
9059     for (i = 0; i < 256; i++) {
9060         if (unicode_latin1[i]) {
9061             Py_DECREF(unicode_latin1[i]);
9062             unicode_latin1[i] = NULL;
9063         }
9064     }
9065     (void)PyUnicode_ClearFreeList();
9066 }
9067
9068 #ifdef __cplusplus
9069 }
9070 #endif
9071
9072
9073 /*
9074 Local variables:
9075 c-basic-offset: 4
9076 indent-tabs-mode: nil
9077 End:
9078 */