Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15     Copyright (c) 1999 by Secret Labs AB
  16     Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep the allocated Unicode buffer intact for
   all objects on the free list whose length (counted in Py_UNICODE
   units) is less than this limit. This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117         0, 0, 0, 0, 0, 0, 0, 0,
 118 //     case 0x0009: /* HORIZONTAL TABULATION */
 119 //     case 0x000A: /* LINE FEED */
 120 //     case 0x000B: /* VERTICAL TABULATION */
 121 //     case 0x000C: /* FORM FEED */
 122 //     case 0x000D: /* CARRIAGE RETURN */
 123         0, 1, 1, 1, 1, 1, 0, 0,
 124         0, 0, 0, 0, 0, 0, 0, 0,
 125 //     case 0x001C: /* FILE SEPARATOR */
 126 //     case 0x001D: /* GROUP SEPARATOR */
 127 //     case 0x001E: /* RECORD SEPARATOR */
 128 //     case 0x001F: /* UNIT SEPARATOR */
 129         0, 0, 0, 0, 1, 1, 1, 1,
 130 //     case 0x0020: /* SPACE */
 131         1, 0, 0, 0, 0, 0, 0, 0,
 132         0, 0, 0, 0, 0, 0, 0, 0,
 133         0, 0, 0, 0, 0, 0, 0, 0,
 134         0, 0, 0, 0, 0, 0, 0, 0,
 135
 136         0, 0, 0, 0, 0, 0, 0, 0,
 137         0, 0, 0, 0, 0, 0, 0, 0,
 138         0, 0, 0, 0, 0, 0, 0, 0,
 139         0, 0, 0, 0, 0, 0, 0, 0,
 140         0, 0, 0, 0, 0, 0, 0, 0,
 141         0, 0, 0, 0, 0, 0, 0, 0,
 142         0, 0, 0, 0, 0, 0, 0, 0,
 143         0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148         0, 0, 0, 0, 0, 0, 0, 0,
 149 //         0x000A, /* LINE FEED */
 150 //         0x000D, /* CARRIAGE RETURN */
 151         0, 0, 1, 0, 0, 1, 0, 0,
 152         0, 0, 0, 0, 0, 0, 0, 0,
 153 //         0x001C, /* FILE SEPARATOR */
 154 //         0x001D, /* GROUP SEPARATOR */
 155 //         0x001E, /* RECORD SEPARATOR */
 156         0, 0, 0, 0, 1, 1, 1, 0,
 157         0, 0, 0, 0, 0, 0, 0, 0,
 158         0, 0, 0, 0, 0, 0, 0, 0,
 159         0, 0, 0, 0, 0, 0, 0, 0,
 160         0, 0, 0, 0, 0, 0, 0, 0,
 161
 162         0, 0, 0, 0, 0, 0, 0, 0,
 163         0, 0, 0, 0, 0, 0, 0, 0,
 164         0, 0, 0, 0, 0, 0, 0, 0,
 165         0, 0, 0, 0, 0, 0, 0, 0,
 166         0, 0, 0, 0, 0, 0, 0, 0,
 167         0, 0, 0, 0, 0, 0, 0, 0,
 168         0, 0, 0, 0, 0, 0, 0, 0,
 169         0, 0, 0, 0, 0, 0, 0, 0
 170 };
 171
 172
 173 Py_UNICODE
 174 PyUnicode_GetMax(void)
 175 {
 176 #ifdef Py_UNICODE_WIDE
 177         return 0x10FFFF;
 178 #else
 179         /* This is actually an illegal character, so it should
 180            not be passed to unichr. */
 181         return 0xFFFF;
 182 #endif
 183 }
 184
 185 /* --- Bloom Filters ----------------------------------------------------- */
 186
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
 192
 193 #define BLOOM_MASK unsigned long
 194
 195 static BLOOM_MASK bloom_linebreak;
 196
 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
 198
 199 #define BLOOM_LINEBREAK(ch) \
 200     ((ch) < 128U ? ascii_linebreak[(ch)] : \
 201     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 202
 203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 204 {
 205     /* calculate simple bloom-style bitmask for a given unicode string */
 206
 207     long mask;
 208     Py_ssize_t i;
 209
 210     mask = 0;
 211     for (i = 0; i < len; i++)
 212         mask |= (1 << (ptr[i] & 0x1F));
 213
 214     return mask;
 215 }
 216
 217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 218 {
 219     Py_ssize_t i;
 220
 221     for (i = 0; i < setlen; i++)
 222         if (set[i] == chr)
 223             return 1;
 224
 225     return 0;
 226 }
 227
 228 #define BLOOM_MEMBER(mask, chr, set, setlen)\
 229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 230
 231 /* --- Unicode Object ----------------------------------------------------- */
 232
 233 static
 234 int unicode_resize(register PyUnicodeObject *unicode,
 235                       Py_ssize_t length)
 236 {
 237     void *oldstr;
 238
 239     /* Shortcut if there's nothing much to do. */
 240     if (unicode->length == length)
 241         goto reset;
 242
 243     /* Resizing shared object (unicode_empty or single character
 244        objects) in-place is not allowed. Use PyUnicode_Resize()
 245        instead ! */
 246
 247     if (unicode == unicode_empty ||
 248         (unicode->length == 1 &&
 249          unicode->str[0] < 256U &&
 250          unicode_latin1[unicode->str[0]] == unicode)) {
 251         PyErr_SetString(PyExc_SystemError,
 252                         "can't resize shared unicode objects");
 253         return -1;
 254     }
 255
 256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 257        The overallocation is also used by fastsearch, which assumes that it's
 258        safe to look at str[length] (without making any assumptions about what
 259        it contains). */
 260
 261     oldstr = unicode->str;
 262     unicode->str = PyObject_REALLOC(unicode->str,
 263                                     sizeof(Py_UNICODE) * (length + 1));
 264     if (!unicode->str) {
 265         unicode->str = (Py_UNICODE *)oldstr;
 266         PyErr_NoMemory();
 267         return -1;
 268     }
 269     unicode->str[length] = 0;
 270     unicode->length = length;
 271
 272  reset:
 273     /* Reset the object caches */
 274     if (unicode->defenc) {
 275         Py_DECREF(unicode->defenc);
 276         unicode->defenc = NULL;
 277     }
 278     unicode->hash = -1;
 279
 280     return 0;
 281 }
 282
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
 290
 291 static
 292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 293 {
 294     register PyUnicodeObject *unicode;
 295
 296     /* Optimization for empty strings */
 297     if (length == 0 && unicode_empty != NULL) {
 298         Py_INCREF(unicode_empty);
 299         return unicode_empty;
 300     }
 301
 302     /* Ensure we won't overflow the size. */
 303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 304         return (PyUnicodeObject *)PyErr_NoMemory();
 305     }
 306
 307     /* Unicode freelist & memory allocation */
 308     if (free_list) {
 309         unicode = free_list;
 310         free_list = *(PyUnicodeObject **)unicode;
 311         numfree--;
 312         if (unicode->str) {
 313             /* Keep-Alive optimization: we only upsize the buffer,
 314                never downsize it. */
 315             if ((unicode->length < length) &&
 316                 unicode_resize(unicode, length) < 0) {
 317                 PyObject_DEL(unicode->str);
 318                 unicode->str = NULL;
 319             }
 320         }
 321         else {
 322             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 323             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 324         }
 325         PyObject_INIT(unicode, &PyUnicode_Type);
 326     }
 327     else {
 328         size_t new_size;
 329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 330         if (unicode == NULL)
 331             return NULL;
 332         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 333         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 334     }
 335
 336     if (!unicode->str) {
 337         PyErr_NoMemory();
 338         goto onError;
 339     }
 340     /* Initialize the first element to guard against cases where
 341      * the caller fails before initializing str -- unicode_resize()
 342      * reads str[0], and the Keep-Alive optimization can keep memory
 343      * allocated for str alive across a call to unicode_dealloc(unicode).
 344      * We don't want unicode_resize to read uninitialized memory in
 345      * that case.
 346      */
 347     unicode->str[0] = 0;
 348     unicode->str[length] = 0;
 349     unicode->length = length;
 350     unicode->hash = -1;
 351     unicode->defenc = NULL;
 352     return unicode;
 353
 354  onError:
 355     /* XXX UNREF/NEWREF interface should be more symmetrical */
 356     _Py_DEC_REFTOTAL;
 357     _Py_ForgetReference((PyObject *)unicode);
 358     PyObject_Del(unicode);
 359     return NULL;
 360 }
 361
 362 static
 363 void unicode_dealloc(register PyUnicodeObject *unicode)
 364 {
 365     if (PyUnicode_CheckExact(unicode) &&
 366         numfree < PyUnicode_MAXFREELIST) {
 367         /* Keep-Alive optimization */
 368         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 369             PyObject_DEL(unicode->str);
 370             unicode->str = NULL;
 371             unicode->length = 0;
 372         }
 373         if (unicode->defenc) {
 374             Py_DECREF(unicode->defenc);
 375             unicode->defenc = NULL;
 376         }
 377         /* Add to free list */
 378         *(PyUnicodeObject **)unicode = free_list;
 379         free_list = unicode;
 380         numfree++;
 381     }
 382     else {
 383         PyObject_DEL(unicode->str);
 384         Py_XDECREF(unicode->defenc);
 385         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 386     }
 387 }
 388
 389 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 390 {
 391     register PyUnicodeObject *v;
 392
 393     /* Argument checks */
 394     if (unicode == NULL) {
 395         PyErr_BadInternalCall();
 396         return -1;
 397     }
 398     v = (PyUnicodeObject *)*unicode;
 399     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 400         PyErr_BadInternalCall();
 401         return -1;
 402     }
 403
 404     /* Resizing unicode_empty and single character objects is not
 405        possible since these are being shared. We simply return a fresh
 406        copy with the same Unicode content. */
 407     if (v->length != length &&
 408         (v == unicode_empty || v->length == 1)) {
 409         PyUnicodeObject *w = _PyUnicode_New(length);
 410         if (w == NULL)
 411             return -1;
 412         Py_UNICODE_COPY(w->str, v->str,
 413                         length < v->length ? length : v->length);
 414         Py_DECREF(*unicode);
 415         *unicode = (PyObject *)w;
 416         return 0;
 417     }
 418
 419     /* Note that we don't have to modify *unicode for unshared Unicode
 420        objects, since we can modify them in-place. */
 421     return unicode_resize(v, length);
 422 }
 423
 424 /* Internal API for use in unicodeobject.c only ! */
 425 #define _PyUnicode_Resize(unicodevar, length) \
 426         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 427
 428 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 429                                 Py_ssize_t size)
 430 {
 431     PyUnicodeObject *unicode;
 432
 433     /* If the Unicode data is known at construction time, we can apply
 434        some optimizations which share commonly used objects. */
 435     if (u != NULL) {
 436
 437         /* Optimization for empty strings */
 438         if (size == 0 && unicode_empty != NULL) {
 439             Py_INCREF(unicode_empty);
 440             return (PyObject *)unicode_empty;
 441         }
 442
 443         /* Single character Unicode objects in the Latin-1 range are
 444            shared when using this constructor */
 445         if (size == 1 && *u < 256) {
 446             unicode = unicode_latin1[*u];
 447             if (!unicode) {
 448                 unicode = _PyUnicode_New(1);
 449                 if (!unicode)
 450                     return NULL;
 451                 unicode->str[0] = *u;
 452                 unicode_latin1[*u] = unicode;
 453             }
 454             Py_INCREF(unicode);
 455             return (PyObject *)unicode;
 456         }
 457     }
 458
 459     unicode = _PyUnicode_New(size);
 460     if (!unicode)
 461         return NULL;
 462
 463     /* Copy the Unicode data into the new object */
 464     if (u != NULL)
 465         Py_UNICODE_COPY(unicode->str, u, size);
 466
 467     return (PyObject *)unicode;
 468 }
 469
 470 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 471 {
 472     PyUnicodeObject *unicode;
 473
 474         if (size < 0) {
 475                 PyErr_SetString(PyExc_SystemError,
 476                     "Negative size passed to PyUnicode_FromStringAndSize");
 477                 return NULL;
 478         }
 479
 480     /* If the Unicode data is known at construction time, we can apply
 481        some optimizations which share commonly used objects.
 482        Also, this means the input must be UTF-8, so fall back to the
 483        UTF-8 decoder at the end. */
 484     if (u != NULL) {
 485
 486         /* Optimization for empty strings */
 487         if (size == 0 && unicode_empty != NULL) {
 488             Py_INCREF(unicode_empty);
 489             return (PyObject *)unicode_empty;
 490         }
 491
 492         /* Single characters are shared when using this constructor.
 493            Restrict to ASCII, since the input must be UTF-8. */
 494         if (size == 1 && Py_CHARMASK(*u) < 128) {
 495             unicode = unicode_latin1[Py_CHARMASK(*u)];
 496             if (!unicode) {
 497                 unicode = _PyUnicode_New(1);
 498                 if (!unicode)
 499                     return NULL;
 500                 unicode->str[0] = Py_CHARMASK(*u);
 501                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 502             }
 503             Py_INCREF(unicode);
 504             return (PyObject *)unicode;
 505         }
 506
 507         return PyUnicode_DecodeUTF8(u, size, NULL);
 508     }
 509
 510     unicode = _PyUnicode_New(size);
 511     if (!unicode)
 512         return NULL;
 513
 514     return (PyObject *)unicode;
 515 }
 516
 517 PyObject *PyUnicode_FromString(const char *u)
 518 {
 519     size_t size = strlen(u);
 520     if (size > PY_SSIZE_T_MAX) {
 521         PyErr_SetString(PyExc_OverflowError, "input too long");
 522         return NULL;
 523     }
 524
 525     return PyUnicode_FromStringAndSize(u, size);
 526 }
 527
 528 #ifdef HAVE_WCHAR_H
 529
 530 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 531                                  Py_ssize_t size)
 532 {
 533     PyUnicodeObject *unicode;
 534
 535     if (w == NULL) {
 536         PyErr_BadInternalCall();
 537         return NULL;
 538     }
 539
 540     unicode = _PyUnicode_New(size);
 541     if (!unicode)
 542         return NULL;
 543
 544     /* Copy the wchar_t data into the new object */
 545 #ifdef HAVE_USABLE_WCHAR_T
 546     memcpy(unicode->str, w, size * sizeof(wchar_t));
 547 #else
 548     {
 549         register Py_UNICODE *u;
 550         register Py_ssize_t i;
 551         u = PyUnicode_AS_UNICODE(unicode);
 552         for (i = size; i > 0; i--)
 553             *u++ = *w++;
 554     }
 555 #endif
 556
 557     return (PyObject *)unicode;
 558 }
 559
 560 static void
 561 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 562 {
 563         *fmt++ = '%';
 564         if (width) {
 565                 if (zeropad)
 566                         *fmt++ = '0';
 567                 fmt += sprintf(fmt, "%d", width);
 568         }
 569         if (precision)
 570                 fmt += sprintf(fmt, ".%d", precision);
 571         if (longflag)
 572                 *fmt++ = 'l';
 573         else if (size_tflag) {
 574                 char *f = PY_FORMAT_SIZE_T;
 575                 while (*f)
 576                         *fmt++ = *f++;
 577         }
 578         *fmt++ = c;
 579         *fmt = '\0';
 580 }
 581
 582 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 583
 584 PyObject *
 585 PyUnicode_FromFormatV(const char *format, va_list vargs)
 586 {
 587         va_list count;
 588         Py_ssize_t callcount = 0;
 589         PyObject **callresults = NULL;
 590         PyObject **callresult = NULL;
 591         Py_ssize_t n = 0;
 592         int width = 0;
 593         int precision = 0;
 594         int zeropad;
 595         const char* f;
 596         Py_UNICODE *s;
 597         PyObject *string;
 598         /* used by sprintf */
 599         char buffer[21];
 600         /* use abuffer instead of buffer, if we need more space
 601          * (which can happen if there's a format specifier with width). */
 602         char *abuffer = NULL;
 603         char *realbuffer;
 604         Py_ssize_t abuffersize = 0;
 605         char fmt[60]; /* should be enough for %0width.precisionld */
 606         const char *copy;
 607
 608 #ifdef VA_LIST_IS_ARRAY
 609         Py_MEMCPY(count, vargs, sizeof(va_list));
 610 #else
 611 #ifdef  __va_copy
 612         __va_copy(count, vargs);
 613 #else
 614         count = vargs;
 615 #endif
 616 #endif
 617         /* step 1: count the number of %S/%R format specifications
 618          * (we call PyObject_Str()/PyObject_Repr() for these objects
 619          * once during step 3 and put the result in an array) */
 620         for (f = format; *f; f++) {
 621                 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
 622                         ++callcount;
 623         }
 624         /* step 2: allocate memory for the results of
 625          * PyObject_Str()/PyObject_Repr() calls */
 626         if (callcount) {
 627                 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 628                 if (!callresults) {
 629                         PyErr_NoMemory();
 630                         return NULL;
 631                 }
 632                 callresult = callresults;
 633         }
 634         /* step 3: figure out how large a buffer we need */
 635         for (f = format; *f; f++) {
 636                 if (*f == '%') {
 637                         const char* p = f;
 638                         width = 0;
 639                         while (isdigit((unsigned)*f))
 640                                 width = (width*10) + *f++ - '0';
 641                         while (*++f && *f != '%' && !isalpha((unsigned)*f))
 642                                 ;
 643
 644                         /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 645                          * they don't affect the amount of space we reserve.
 646                          */
 647                         if ((*f == 'l' || *f == 'z') &&
 648                                         (f[1] == 'd' || f[1] == 'u'))
 649                                 ++f;
 650
 651                         switch (*f) {
 652                         case 'c':
 653                                 (void)va_arg(count, int);
 654                                 /* fall through... */
 655                         case '%':
 656                                 n++;
 657                                 break;
 658                         case 'd': case 'u': case 'i': case 'x':
 659                                 (void) va_arg(count, int);
 660                                 /* 20 bytes is enough to hold a 64-bit
 661                                    integer.  Decimal takes the most space.
 662                                    This isn't enough for octal.
 663                                    If a width is specified we need more
 664                                    (which we allocate later). */
 665                                 if (width < 20)
 666                                         width = 20;
 667                                 n += width;
 668                                 if (abuffersize < width)
 669                                         abuffersize = width;
 670                                 break;
 671                         case 's':
 672                         {
 673                                 /* UTF-8 */
 674                                 unsigned char*s;
 675                                 s = va_arg(count, unsigned char*);
 676                                 while (*s) {
 677                                         if (*s < 128) {
 678                                                 n++; s++;
 679                                         } else if (*s < 0xc0) {
 680                                                 /* invalid UTF-8 */
 681                                                 n++; s++;
 682                                         } else if (*s < 0xc0) {
 683                                                 n++;
 684                                                 s++; if(!*s)break;
 685                                                 s++;
 686                                         } else if (*s < 0xe0) {
 687                                                 n++;
 688                                                 s++; if(!*s)break;
 689                                                 s++; if(!*s)break;
 690                                                 s++;
 691                                         } else {
 692                                                 #ifdef Py_UNICODE_WIDE
 693                                                 n++;
 694                                                 #else
 695                                                 n+=2;
 696                                                 #endif
 697                                                 s++; if(!*s)break;
 698                                                 s++; if(!*s)break;
 699                                                 s++; if(!*s)break;
 700                                                 s++;
 701                                         }
 702                                 }
 703                                 break;
 704                         }
 705                         case 'U':
 706                         {
 707                                 PyObject *obj = va_arg(count, PyObject *);
 708                                 assert(obj && PyUnicode_Check(obj));
 709                                 n += PyUnicode_GET_SIZE(obj);
 710                                 break;
 711                         }
 712                         case 'V':
 713                         {
 714                                 PyObject *obj = va_arg(count, PyObject *);
 715                                 const char *str = va_arg(count, const char *);
 716                                 assert(obj || str);
 717                                 assert(!obj || PyUnicode_Check(obj));
 718                                 if (obj)
 719                                         n += PyUnicode_GET_SIZE(obj);
 720                                 else
 721                                         n += strlen(str);
 722                                 break;
 723                         }
 724                         case 'S':
 725                         {
 726                                 PyObject *obj = va_arg(count, PyObject *);
 727                                 PyObject *str;
 728                                 assert(obj);
 729                                 str = PyObject_Str(obj);
 730                                 if (!str)
 731                                         goto fail;
 732                                 n += PyUnicode_GET_SIZE(str);
 733                                 /* Remember the str and switch to the next slot */
 734                                 *callresult++ = str;
 735                                 break;
 736                         }
 737                         case 'R':
 738                         {
 739                                 PyObject *obj = va_arg(count, PyObject *);
 740                                 PyObject *repr;
 741                                 assert(obj);
 742                                 repr = PyObject_Repr(obj);
 743                                 if (!repr)
 744                                         goto fail;
 745                                 n += PyUnicode_GET_SIZE(repr);
 746                                 /* Remember the repr and switch to the next slot */
 747                                 *callresult++ = repr;
 748                                 break;
 749                         }
 750                         case 'p':
 751                                 (void) va_arg(count, int);
 752                                 /* maximum 64-bit pointer representation:
 753                                  * 0xffffffffffffffff
 754                                  * so 19 characters is enough.
 755                                  * XXX I count 18 -- what's the extra for?
 756                                  */
 757                                 n += 19;
 758                                 break;
 759                         default:
 760                                 /* if we stumble upon an unknown
 761                                    formatting code, copy the rest of
 762                                    the format string to the output
 763                                    string. (we cannot just skip the
 764                                    code, since there's no way to know
 765                                    what's in the argument list) */
 766                                 n += strlen(p);
 767                                 goto expand;
 768                         }
 769                 } else
 770                         n++;
 771         }
 772  expand:
 773         if (abuffersize > 20) {
 774                 abuffer = PyObject_Malloc(abuffersize);
 775                 if (!abuffer) {
 776                         PyErr_NoMemory();
 777                         goto fail;
 778                 }
 779                 realbuffer = abuffer;
 780         }
 781         else
 782                 realbuffer = buffer;
 783         /* step 4: fill the buffer */
 784         /* Since we've analyzed how much space we need for the worst case,
 785            we don't have to resize the string.
 786            There can be no errors beyond this point. */
 787         string = PyUnicode_FromUnicode(NULL, n);
 788         if (!string)
 789                 goto fail;
 790
 791         s = PyUnicode_AS_UNICODE(string);
 792         callresult = callresults;
 793
 794         for (f = format; *f; f++) {
 795                 if (*f == '%') {
 796                         const char* p = f++;
 797                         int longflag = 0;
 798                         int size_tflag = 0;
 799                         zeropad = (*f == '0');
 800                         /* parse the width.precision part */
 801                         width = 0;
 802                         while (isdigit((unsigned)*f))
 803                                 width = (width*10) + *f++ - '0';
 804                         precision = 0;
 805                         if (*f == '.') {
 806                                 f++;
 807                                 while (isdigit((unsigned)*f))
 808                                         precision = (precision*10) + *f++ - '0';
 809                         }
 810                         /* handle the long flag, but only for %ld and %lu.
 811                            others can be added when necessary. */
 812                         if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 813                                 longflag = 1;
 814                                 ++f;
 815                         }
 816                         /* handle the size_t flag. */
 817                         if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 818                                 size_tflag = 1;
 819                                 ++f;
 820                         }
 821
 822                         switch (*f) {
 823                         case 'c':
 824                                 *s++ = va_arg(vargs, int);
 825                                 break;
 826                         case 'd':
 827                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 828                                 if (longflag)
 829                                         sprintf(realbuffer, fmt, va_arg(vargs, long));
 830                                 else if (size_tflag)
 831                                         sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 832                                 else
 833                                         sprintf(realbuffer, fmt, va_arg(vargs, int));
 834                                 appendstring(realbuffer);
 835                                 break;
 836                         case 'u':
 837                                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 838                                 if (longflag)
 839                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 840                                 else if (size_tflag)
 841                                         sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 842                                 else
 843                                         sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 844                                 appendstring(realbuffer);
 845                                 break;
 846                         case 'i':
 847                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 848                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 849                                 appendstring(realbuffer);
 850                                 break;
 851                         case 'x':
 852                                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 853                                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 854                                 appendstring(realbuffer);
 855                                 break;
 856                         case 's':
 857                         {
 858                                 /* Parameter must be UTF-8 encoded.
 859                                    In case of encoding errors, use
 860                                    the replacement character. */
 861                                 PyObject *u;
 862                                 p = va_arg(vargs, char*);
 863                                 u = PyUnicode_DecodeUTF8(p, strlen(p),
 864                                                          "replace");
 865                                 if (!u)
 866                                         goto fail;
 867                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
 868                                                 PyUnicode_GET_SIZE(u));
 869                                 s += PyUnicode_GET_SIZE(u);
 870                                 Py_DECREF(u);
 871                                 break;
 872                         }
 873                         case 'U':
 874                         {
 875                                 PyObject *obj = va_arg(vargs, PyObject *);
 876                                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 877                                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 878                                 s += size;
 879                                 break;
 880                         }
 881                         case 'V':
 882                         {
 883                                 PyObject *obj = va_arg(vargs, PyObject *);
 884                                 const char *str = va_arg(vargs, const char *);
 885                                 if (obj) {
 886                                         Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 887                                         Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 888                                         s += size;
 889                                 } else {
 890                                         appendstring(str);
 891                                 }
 892                                 break;
 893                         }
 894                         case 'S':
 895                         case 'R':
 896                         {
 897                                 Py_UNICODE *ucopy;
 898                                 Py_ssize_t usize;
 899                                 Py_ssize_t upos;
 900                                 /* unused, since we already have the result */
 901                                 (void) va_arg(vargs, PyObject *);
 902                                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 903                                 usize = PyUnicode_GET_SIZE(*callresult);
 904                                 for (upos = 0; upos<usize;)
 905                                         *s++ = ucopy[upos++];
 906                                 /* We're done with the unicode()/repr() => forget it */
 907                                 Py_DECREF(*callresult);
 908                                 /* switch to next unicode()/repr() result */
 909                                 ++callresult;
 910                                 break;
 911                         }
 912                         case 'p':
 913                                 sprintf(buffer, "%p", va_arg(vargs, void*));
 914                                 /* %p is ill-defined:  ensure leading 0x. */
 915                                 if (buffer[1] == 'X')
 916                                         buffer[1] = 'x';
 917                                 else if (buffer[1] != 'x') {
 918                                         memmove(buffer+2, buffer, strlen(buffer)+1);
 919                                         buffer[0] = '0';
 920                                         buffer[1] = 'x';
 921                                 }
 922                                 appendstring(buffer);
 923                                 break;
 924                         case '%':
 925                                 *s++ = '%';
 926                                 break;
 927                         default:
 928                                 appendstring(p);
 929                                 goto end;
 930                         }
 931                 } else
 932                         *s++ = *f;
 933         }
 934
 935  end:
 936         if (callresults)
 937                 PyObject_Free(callresults);
 938         if (abuffer)
 939                 PyObject_Free(abuffer);
 940         _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 941         return string;
 942  fail:
 943         if (callresults) {
 944                 PyObject **callresult2 = callresults;
 945                 while (callresult2 < callresult) {
 946                         Py_DECREF(*callresult2);
 947                         ++callresult2;
 948                 }
 949                 PyObject_Free(callresults);
 950         }
 951         if (abuffer)
 952                 PyObject_Free(abuffer);
 953         return NULL;
 954 }
 955
 956 #undef appendstring
 957
 958 PyObject *
 959 PyUnicode_FromFormat(const char *format, ...)
 960 {
 961         PyObject* ret;
 962         va_list vargs;
 963
 964 #ifdef HAVE_STDARG_PROTOTYPES
 965         va_start(vargs, format);
 966 #else
 967         va_start(vargs);
 968 #endif
 969         ret = PyUnicode_FromFormatV(format, vargs);
 970         va_end(vargs);
 971         return ret;
 972 }
 973
 974 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 975                                 wchar_t *w,
 976                                 Py_ssize_t size)
 977 {
 978     if (unicode == NULL) {
 979         PyErr_BadInternalCall();
 980         return -1;
 981     }
 982
 983     /* If possible, try to copy the 0-termination as well */
 984     if (size > PyUnicode_GET_SIZE(unicode))
 985         size = PyUnicode_GET_SIZE(unicode) + 1;
 986
 987 #ifdef HAVE_USABLE_WCHAR_T
 988     memcpy(w, unicode->str, size * sizeof(wchar_t));
 989 #else
 990     {
 991         register Py_UNICODE *u;
 992         register Py_ssize_t i;
 993         u = PyUnicode_AS_UNICODE(unicode);
 994         for (i = size; i > 0; i--)
 995             *w++ = *u++;
 996     }
 997 #endif
 998
 999     if (size > PyUnicode_GET_SIZE(unicode))
1000         return PyUnicode_GET_SIZE(unicode);
1001     else
1002     return size;
1003 }
1004
1005 #endif
1006
1007 PyObject *PyUnicode_FromOrdinal(int ordinal)
1008 {
1009     Py_UNICODE s[1];
1010
1011 #ifdef Py_UNICODE_WIDE
1012     if (ordinal < 0 || ordinal > 0x10ffff) {
1013         PyErr_SetString(PyExc_ValueError,
1014                         "unichr() arg not in range(0x110000) "
1015                         "(wide Python build)");
1016         return NULL;
1017     }
1018 #else
1019     if (ordinal < 0 || ordinal > 0xffff) {
1020         PyErr_SetString(PyExc_ValueError,
1021                         "unichr() arg not in range(0x10000) "
1022                         "(narrow Python build)");
1023         return NULL;
1024     }
1025 #endif
1026
1027     s[0] = (Py_UNICODE)ordinal;
1028     return PyUnicode_FromUnicode(s, 1);
1029 }
1030
1031 PyObject *PyUnicode_FromObject(register PyObject *obj)
1032 {
1033     /* XXX Perhaps we should make this API an alias of
1034            PyObject_Unicode() instead ?! */
1035     if (PyUnicode_CheckExact(obj)) {
1036         Py_INCREF(obj);
1037         return obj;
1038     }
1039     if (PyUnicode_Check(obj)) {
1040         /* For a Unicode subtype that's not a Unicode object,
1041            return a true Unicode object with the same data. */
1042         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1043                                      PyUnicode_GET_SIZE(obj));
1044     }
1045     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1046 }
1047
1048 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1049                                       const char *encoding,
1050                                       const char *errors)
1051 {
1052     const char *s = NULL;
1053     Py_ssize_t len;
1054     PyObject *v;
1055
1056     if (obj == NULL) {
1057         PyErr_BadInternalCall();
1058         return NULL;
1059     }
1060
1061 #if 0
1062     /* For b/w compatibility we also accept Unicode objects provided
1063        that no encodings is given and then redirect to
1064        PyObject_Unicode() which then applies the additional logic for
1065        Unicode subclasses.
1066
1067        NOTE: This API should really only be used for object which
1068              represent *encoded* Unicode !
1069
1070     */
1071         if (PyUnicode_Check(obj)) {
1072             if (encoding) {
1073                 PyErr_SetString(PyExc_TypeError,
1074                                 "decoding Unicode is not supported");
1075             return NULL;
1076             }
1077         return PyObject_Unicode(obj);
1078             }
1079 #else
1080     if (PyUnicode_Check(obj)) {
1081         PyErr_SetString(PyExc_TypeError,
1082                         "decoding Unicode is not supported");
1083         return NULL;
1084         }
1085 #endif
1086
1087     /* Coerce object */
1088     if (PyString_Check(obj)) {
1089             s = PyString_AS_STRING(obj);
1090             len = PyString_GET_SIZE(obj);
1091     }
1092     else if (PyByteArray_Check(obj)) {
1093         /* Python 2.x specific */
1094         PyErr_Format(PyExc_TypeError,
1095                      "decoding bytearray is not supported");
1096         return NULL;
1097     }
1098     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1099         /* Overwrite the error message with something more useful in
1100            case of a TypeError. */
1101         if (PyErr_ExceptionMatches(PyExc_TypeError))
1102         PyErr_Format(PyExc_TypeError,
1103                          "coercing to Unicode: need string or buffer, "
1104                          "%.80s found",
1105                      Py_TYPE(obj)->tp_name);
1106         goto onError;
1107     }
1108
1109     /* Convert to Unicode */
1110     if (len == 0) {
1111         Py_INCREF(unicode_empty);
1112         v = (PyObject *)unicode_empty;
1113     }
1114     else
1115         v = PyUnicode_Decode(s, len, encoding, errors);
1116
1117     return v;
1118
1119  onError:
1120     return NULL;
1121 }
1122
1123 PyObject *PyUnicode_Decode(const char *s,
1124                            Py_ssize_t size,
1125                            const char *encoding,
1126                            const char *errors)
1127 {
1128     PyObject *buffer = NULL, *unicode;
1129
1130     if (encoding == NULL)
1131         encoding = PyUnicode_GetDefaultEncoding();
1132
1133     /* Shortcuts for common default encodings */
1134     if (strcmp(encoding, "utf-8") == 0)
1135         return PyUnicode_DecodeUTF8(s, size, errors);
1136     else if (strcmp(encoding, "latin-1") == 0)
1137         return PyUnicode_DecodeLatin1(s, size, errors);
1138 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1139     else if (strcmp(encoding, "mbcs") == 0)
1140         return PyUnicode_DecodeMBCS(s, size, errors);
1141 #endif
1142     else if (strcmp(encoding, "ascii") == 0)
1143         return PyUnicode_DecodeASCII(s, size, errors);
1144
1145     /* Decode via the codec registry */
1146     buffer = PyBuffer_FromMemory((void *)s, size);
1147     if (buffer == NULL)
1148         goto onError;
1149     unicode = PyCodec_Decode(buffer, encoding, errors);
1150     if (unicode == NULL)
1151         goto onError;
1152     if (!PyUnicode_Check(unicode)) {
1153         PyErr_Format(PyExc_TypeError,
1154                      "decoder did not return an unicode object (type=%.400s)",
1155                      Py_TYPE(unicode)->tp_name);
1156         Py_DECREF(unicode);
1157         goto onError;
1158     }
1159     Py_DECREF(buffer);
1160     return unicode;
1161
1162  onError:
1163     Py_XDECREF(buffer);
1164     return NULL;
1165 }
1166
1167 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1168                                     const char *encoding,
1169                                     const char *errors)
1170 {
1171     PyObject *v;
1172
1173     if (!PyUnicode_Check(unicode)) {
1174         PyErr_BadArgument();
1175         goto onError;
1176     }
1177
1178     if (encoding == NULL)
1179         encoding = PyUnicode_GetDefaultEncoding();
1180
1181     /* Decode via the codec registry */
1182     v = PyCodec_Decode(unicode, encoding, errors);
1183     if (v == NULL)
1184         goto onError;
1185     return v;
1186
1187  onError:
1188     return NULL;
1189 }
1190
1191 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1192                            Py_ssize_t size,
1193                            const char *encoding,
1194                            const char *errors)
1195 {
1196     PyObject *v, *unicode;
1197
1198     unicode = PyUnicode_FromUnicode(s, size);
1199     if (unicode == NULL)
1200         return NULL;
1201     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1202     Py_DECREF(unicode);
1203     return v;
1204 }
1205
1206 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1207                                     const char *encoding,
1208                                     const char *errors)
1209 {
1210     PyObject *v;
1211
1212     if (!PyUnicode_Check(unicode)) {
1213         PyErr_BadArgument();
1214         goto onError;
1215     }
1216
1217     if (encoding == NULL)
1218         encoding = PyUnicode_GetDefaultEncoding();
1219
1220     /* Encode via the codec registry */
1221     v = PyCodec_Encode(unicode, encoding, errors);
1222     if (v == NULL)
1223         goto onError;
1224     return v;
1225
1226  onError:
1227     return NULL;
1228 }
1229
1230 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1231                                     const char *encoding,
1232                                     const char *errors)
1233 {
1234     PyObject *v;
1235
1236     if (!PyUnicode_Check(unicode)) {
1237         PyErr_BadArgument();
1238         goto onError;
1239     }
1240
1241     if (encoding == NULL)
1242         encoding = PyUnicode_GetDefaultEncoding();
1243
1244     /* Shortcuts for common default encodings */
1245     if (errors == NULL) {
1246         if (strcmp(encoding, "utf-8") == 0)
1247             return PyUnicode_AsUTF8String(unicode);
1248         else if (strcmp(encoding, "latin-1") == 0)
1249             return PyUnicode_AsLatin1String(unicode);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251         else if (strcmp(encoding, "mbcs") == 0)
1252             return PyUnicode_AsMBCSString(unicode);
1253 #endif
1254         else if (strcmp(encoding, "ascii") == 0)
1255             return PyUnicode_AsASCIIString(unicode);
1256     }
1257
1258     /* Encode via the codec registry */
1259     v = PyCodec_Encode(unicode, encoding, errors);
1260     if (v == NULL)
1261         goto onError;
1262     if (!PyString_Check(v)) {
1263         PyErr_Format(PyExc_TypeError,
1264                      "encoder did not return a string object (type=%.400s)",
1265                      Py_TYPE(v)->tp_name);
1266         Py_DECREF(v);
1267         goto onError;
1268     }
1269     return v;
1270
1271  onError:
1272     return NULL;
1273 }
1274
1275 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1276                                             const char *errors)
1277 {
1278     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1279
1280     if (v)
1281         return v;
1282     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1283     if (v && errors == NULL)
1284         ((PyUnicodeObject *)unicode)->defenc = v;
1285     return v;
1286 }
1287
1288 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1289 {
1290     if (!PyUnicode_Check(unicode)) {
1291         PyErr_BadArgument();
1292         goto onError;
1293     }
1294     return PyUnicode_AS_UNICODE(unicode);
1295
1296  onError:
1297     return NULL;
1298 }
1299
1300 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1301 {
1302     if (!PyUnicode_Check(unicode)) {
1303         PyErr_BadArgument();
1304         goto onError;
1305     }
1306     return PyUnicode_GET_SIZE(unicode);
1307
1308  onError:
1309     return -1;
1310 }
1311
1312 const char *PyUnicode_GetDefaultEncoding(void)
1313 {
1314     return unicode_default_encoding;
1315 }
1316
1317 int PyUnicode_SetDefaultEncoding(const char *encoding)
1318 {
1319     PyObject *v;
1320
1321     /* Make sure the encoding is valid. As side effect, this also
1322        loads the encoding into the codec registry cache. */
1323     v = _PyCodec_Lookup(encoding);
1324     if (v == NULL)
1325         goto onError;
1326     Py_DECREF(v);
1327     strncpy(unicode_default_encoding,
1328             encoding,
1329             sizeof(unicode_default_encoding));
1330     return 0;
1331
1332  onError:
1333     return -1;
1334 }
1335
1336 /* error handling callback helper:
1337    build arguments, call the callback and check the arguments,
1338    if no exception occurred, copy the replacement to the output
1339    and adjust various state variables.
1340    return 0 on success, -1 on error
1341 */
1342
1343 static
1344 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1345                  const char *encoding, const char *reason,
1346                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1347                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1348                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1349 {
1350     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1351
1352     PyObject *restuple = NULL;
1353     PyObject *repunicode = NULL;
1354     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1355     Py_ssize_t requiredsize;
1356     Py_ssize_t newpos;
1357     Py_UNICODE *repptr;
1358     Py_ssize_t repsize;
1359     int res = -1;
1360
1361     if (*errorHandler == NULL) {
1362         *errorHandler = PyCodec_LookupError(errors);
1363         if (*errorHandler == NULL)
1364            goto onError;
1365     }
1366
1367     if (*exceptionObject == NULL) {
1368         *exceptionObject = PyUnicodeDecodeError_Create(
1369             encoding, input, insize, *startinpos, *endinpos, reason);
1370         if (*exceptionObject == NULL)
1371            goto onError;
1372     }
1373     else {
1374         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1375             goto onError;
1376         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1377             goto onError;
1378         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1379             goto onError;
1380     }
1381
1382     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1383     if (restuple == NULL)
1384         goto onError;
1385     if (!PyTuple_Check(restuple)) {
1386         PyErr_Format(PyExc_TypeError, &argparse[4]);
1387         goto onError;
1388     }
1389     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1390         goto onError;
1391     if (newpos<0)
1392         newpos = insize+newpos;
1393     if (newpos<0 || newpos>insize) {
1394         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1395         goto onError;
1396     }
1397
1398     /* need more space? (at least enough for what we
1399        have+the replacement+the rest of the string (starting
1400        at the new input position), so we won't have to check space
1401        when there are no errors in the rest of the string) */
1402     repptr = PyUnicode_AS_UNICODE(repunicode);
1403     repsize = PyUnicode_GET_SIZE(repunicode);
1404     requiredsize = *outpos + repsize + insize-newpos;
1405     if (requiredsize > outsize) {
1406         if (requiredsize<2*outsize)
1407             requiredsize = 2*outsize;
1408         if (PyUnicode_Resize(output, requiredsize) < 0)
1409             goto onError;
1410         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1411     }
1412     *endinpos = newpos;
1413     *inptr = input + newpos;
1414     Py_UNICODE_COPY(*outptr, repptr, repsize);
1415     *outptr += repsize;
1416     *outpos += repsize;
1417     /* we made it! */
1418     res = 0;
1419
1420     onError:
1421     Py_XDECREF(restuple);
1422     return res;
1423 }
1424
1425 /* --- UTF-7 Codec -------------------------------------------------------- */
1426
1427 /* see RFC2152 for details */
1428
/* Classification table for the 128 ASCII code points, indexed by
   character code and consulted through the SPECIAL() macro.  The table
   is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1447
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if character c cannot be written directly: it is outside ASCII,
   is marked special in utf7_special, or falls in one of the optional
   classes (whitespace / Set O) that the caller asked to have encoded. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Written with explicit ASCII
   ranges instead of isalnum(): isalnum() depends on the current locale and
   may accept bytes >= 0x80, which are not base64 digits. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c) holds */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through `out` for as long as at least 6 pending bits
   are held in `ch`; `bits` is the pending bit count and is reduced by 6
   for every digit written. */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        *(out)++ = B64((ch) >> ((bits)-6));     \
        (bits) -= 6;                            \
    }

/* Emit 16-bit characters through `out` for as long as at least 16 pending
   bits are held in `ch`.  Expands to code that assigns to a variable named
   `errmsg` and jumps to a label named `utf7Error`; both must exist in the
   function that uses the macro. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16;                                                   \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0;                                            \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            (surrogate) = 1;                                            \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *(out)++ = outCh;                                           \
        }                                                               \
    }
1490
1491 PyObject *PyUnicode_DecodeUTF7(const char *s,
1492                                Py_ssize_t size,
1493                                const char *errors)
1494 {
1495     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1496 }
1497
/* Decode the `size` bytes at `s` as UTF-7.

   The output object is created with room for `size` characters and is
   trimmed with _PyUnicode_Resize() to the number of characters actually
   written before it is returned.

   errors   - handed to unicode_decode_call_errorhandler() whenever
              malformed input is found.
   consumed - if NULL, input that ends while still inside a shift sequence
              is reported through the error handler ("unterminated shift
              sequence").  If non-NULL, that error is not raised; on
              return *consumed is startinpos (the last recorded start
              offset) when the input ended inside a shift sequence, and
              the number of bytes processed otherwise.

   Returns the decoded object, or NULL when the output object cannot be
   created, the error handler call fails, or the final resize fails. */
1498 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1499                                Py_ssize_t size,
1500                                const char *errors,
1501                                Py_ssize_t *consumed)
1502 {
1503     const char *starts = s;
         /* startinpos/endinpos: input offsets reported to the error handler;
            outpos: output offset reported to the error handler */
1504     Py_ssize_t startinpos;
1505     Py_ssize_t endinpos;
1506     Py_ssize_t outpos;
1507     const char *e;
1508     PyUnicodeObject *unicode;
1509     Py_UNICODE *p;
         /* errmsg is assigned by the DECODE macro as well as by this function */
1510     const char *errmsg = "";
1511     int inShift = 0;
         /* charsleft accumulates decoded base64 bits; bitsleft counts how many
            of them have not been turned into output yet */
1512     unsigned int bitsleft = 0;
1513     unsigned long charsleft = 0;
1514     int surrogate = 0;
1515     PyObject *errorHandler = NULL;
1516     PyObject *exc = NULL;
1517
1518     unicode = _PyUnicode_New(size);
1519     if (!unicode)
1520         return NULL;
1521     if (size == 0) {
1522         if (consumed)
1523             *consumed = 0;
1524         return (PyObject *)unicode;
1525     }
1526
1527     p = unicode->str;
1528     e = s + size;
1529
1530     while (s < e) {
1531         Py_UNICODE ch;
             /* Re-entry point: the end-of-input handling below jumps here when
                the error handler left s inside the buffer */
1532         restart:
1533         ch = (unsigned char) *s;
1534
1535         if (inShift) {
1536             if ((ch == '-') || !B64CHAR(ch)) {
                     /* '-' or any non-base64 character ends the shift sequence */
1537                 inShift = 0;
1538                 s++;
1539
1540                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1541                 if (bitsleft >= 6) {
1542                     /* The shift sequence has a partial character in it. If
1543                        bitsleft < 6 then we could just classify it as padding
1544                        but that is not the case here */
1545
1546                     errmsg = "partial character in shift sequence";
1547                     goto utf7Error;
1548                 }
1549                 /* According to RFC2152 the remaining bits should be zero. We
1550                    choose to signal an error/insert a replacement character
1551                    here to indicate the potential of a misencoded character. */
1552
1553                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1554                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1555                     errmsg = "non-zero padding bits in shift sequence";
1556                     goto utf7Error;
1557                 }
1558
1559                 if (ch == '-') {
                         /* A second '-' right after the terminator: emit a literal
                            '-' and re-enter shift state without advancing s, so the
                            next iteration handles that second '-' as a terminator */
1560                     if ((s < e) && (*(s) == '-')) {
1561                         *p++ = '-';
1562                         inShift = 1;
1563                     }
1564                 } else if (SPECIAL(ch,0,0)) {
1565                     errmsg = "unexpected special character";
1566                         goto utf7Error;
1567                 } else  {
                         /* The terminating character itself is ordinary output */
1568                     *p++ = ch;
1569                 }
1570             } else {
                     /* Base64 digit: append its 6 bits and emit any complete
                        16-bit characters */
1571                 charsleft = (charsleft << 6) | UB64(ch);
1572                 bitsleft += 6;
1573                 s++;
1574                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1575             }
1576         }
1577         else if ( ch == '+' ) {
1578             startinpos = s-starts;
1579             s++;
                 /* "+-" stands for a literal '+'; any other follower opens a
                    shift sequence */
1580             if (s < e && *s == '-') {
1581                 s++;
1582                 *p++ = '+';
1583             } else
1584             {
1585                 inShift = 1;
1586                 bitsleft = 0;
1587             }
1588         }
1589         else if (SPECIAL(ch,0,0)) {
1590             startinpos = s-starts;
1591             errmsg = "unexpected special character";
1592             s++;
1593                 goto utf7Error;
1594         }
1595         else {
                 /* Directly encoded character: copy it through */
1596             *p++ = ch;
1597             s++;
1598         }
1599         continue;
         /* Shared error path, reached by goto from above and from the DECODE
            macro; the error handler may move s and replace/resize unicode */
1600     utf7Error:
1601         outpos = p-PyUnicode_AS_UNICODE(unicode);
1602         endinpos = s-starts;
1603         if (unicode_decode_call_errorhandler(
1604              errors, &errorHandler,
1605              "utf7", errmsg,
1606              starts, size, &startinpos, &endinpos, &exc, &s,
1607              (PyObject **)&unicode, &outpos, &p))
1608         goto onError;
1609     }
1610
         /* Input ended inside a shift sequence: an error only when the caller
            did not ask for the consumed count */
1611     if (inShift && !consumed) {
1612         outpos = p-PyUnicode_AS_UNICODE(unicode);
1613         endinpos = size;
1614         if (unicode_decode_call_errorhandler(
1615              errors, &errorHandler,
1616              "utf7", "unterminated shift sequence",
1617              starts, size, &startinpos, &endinpos, &exc, &s,
1618              (PyObject **)&unicode, &outpos, &p))
1619             goto onError;
1620         if (s < e)
1621            goto restart;
1622     }
1623     if (consumed) {
1624         if(inShift)
1625             *consumed = startinpos;
1626         else
1627             *consumed = s-starts;
1628     }
1629
         /* Trim the output object to the number of characters written */
1630     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1631         goto onError;
1632
1633     Py_XDECREF(errorHandler);
1634     Py_XDECREF(exc);
1635     return (PyObject *)unicode;
1636
1637 onError:
1638     Py_XDECREF(errorHandler);
1639     Py_XDECREF(exc);
1640     Py_DECREF(unicode);
1641     return NULL;
1642 }
1643
1644
1645 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1646                    Py_ssize_t size,
1647                    int encodeSetO,
1648                    int encodeWhiteSpace,
1649                    const char *errors)
1650 {
1651     PyObject *v;
1652     /* It might be possible to tighten this worst case */
1653     Py_ssize_t cbAllocated = 5 * size;
1654     int inShift = 0;
1655     Py_ssize_t i = 0;
1656     unsigned int bitsleft = 0;
1657     unsigned long charsleft = 0;
1658     char * out;
1659     char * start;
1660
1661     if (cbAllocated / 5 != size)
1662         return PyErr_NoMemory();
1663
1664     if (size == 0)
1665                 return PyString_FromStringAndSize(NULL, 0);
1666
1667     v = PyString_FromStringAndSize(NULL, cbAllocated);
1668     if (v == NULL)
1669         return NULL;
1670
1671     start = out = PyString_AS_STRING(v);
1672     for (;i < size; ++i) {
1673         Py_UNICODE ch = s[i];
1674
1675         if (!inShift) {
1676             if (ch == '+') {
1677                 *out++ = '+';
1678                 *out++ = '-';
1679             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1680                 charsleft = ch;
1681                 bitsleft = 16;
1682                 *out++ = '+';
1683                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1684                 inShift = bitsleft > 0;
1685             } else {
1686                 *out++ = (char) ch;
1687             }
1688         } else {
1689             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1690                 *out++ = B64(charsleft << (6-bitsleft));
1691                 charsleft = 0;
1692                 bitsleft = 0;
1693                 /* Characters not in the BASE64 set implicitly unshift the sequence
1694                    so no '-' is required, except if the character is itself a '-' */
1695                 if (B64CHAR(ch) || ch == '-') {
1696                     *out++ = '-';
1697                 }
1698                 inShift = 0;
1699                 *out++ = (char) ch;
1700             } else {
1701                 bitsleft += 16;
1702                 charsleft = (charsleft << 16) | ch;
1703                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1704
1705                 /* If the next character is special then we dont' need to terminate
1706                    the shift sequence. If the next character is not a BASE64 character
1707                    or '-' then the shift sequence will be terminated implicitly and we
1708                    don't have to insert a '-'. */
1709
1710                 if (bitsleft == 0) {
1711                     if (i + 1 < size) {
1712                         Py_UNICODE ch2 = s[i+1];
1713
1714                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1715
1716                         } else if (B64CHAR(ch2) || ch2 == '-') {
1717                             *out++ = '-';
1718                             inShift = 0;
1719                         } else {
1720                             inShift = 0;
1721                         }
1722
1723                     }
1724                     else {
1725                         *out++ = '-';
1726                         inShift = 0;
1727                     }
1728                 }
1729             }
1730         }
1731     }
1732     if (bitsleft) {
1733         *out++= B64(charsleft << (6-bitsleft) );
1734         *out++ = '-';
1735     }
1736
1737     _PyString_Resize(&v, out - start);
1738     return v;
1739 }
1740
/* Remove the helper macros used by the UTF-7 codec above so that they are
   not visible to the rest of this file. */
1741 #undef SPECIAL
1742 #undef B64
1743 #undef B64CHAR
1744 #undef UB64
1745 #undef ENCODE
1746 #undef DECODE
1747
1748 /* --- UTF-8 Codec -------------------------------------------------------- */
1749
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1771
1772 PyObject *PyUnicode_DecodeUTF8(const char *s,
1773                                Py_ssize_t size,
1774                                const char *errors)
1775 {
1776     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1777 }
1778
1779 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1780                                         Py_ssize_t size,
1781                                         const char *errors,
1782                                         Py_ssize_t *consumed)
1783 {
1784     const char *starts = s;
1785     int n;
1786     Py_ssize_t startinpos;
1787     Py_ssize_t endinpos;
1788     Py_ssize_t outpos;
1789     const char *e;
1790     PyUnicodeObject *unicode;
1791     Py_UNICODE *p;
1792     const char *errmsg = "";
1793     PyObject *errorHandler = NULL;
1794     PyObject *exc = NULL;
1795
1796     /* Note: size will always be longer than the resulting Unicode
1797        character count */
1798     unicode = _PyUnicode_New(size);
1799     if (!unicode)
1800         return NULL;
1801     if (size == 0) {
1802         if (consumed)
1803             *consumed = 0;
1804         return (PyObject *)unicode;
1805     }
1806
1807     /* Unpack UTF-8 encoded data */
1808     p = unicode->str;
1809     e = s + size;
1810
1811     while (s < e) {
1812         Py_UCS4 ch = (unsigned char)*s;
1813
1814         if (ch < 0x80) {
1815             *p++ = (Py_UNICODE)ch;
1816             s++;
1817             continue;
1818         }
1819
1820         n = utf8_code_length[ch];
1821
1822         if (s + n > e) {
1823             if (consumed)
1824                 break;
1825             else {
1826                 errmsg = "unexpected end of data";
1827                 startinpos = s-starts;
1828                 endinpos = size;
1829                 goto utf8Error;
1830             }
1831         }
1832
1833         switch (n) {
1834
1835         case 0:
1836             errmsg = "unexpected code byte";
1837             startinpos = s-starts;
1838             endinpos = startinpos+1;
1839             goto utf8Error;
1840
1841         case 1:
1842             errmsg = "internal error";
1843             startinpos = s-starts;
1844             endinpos = startinpos+1;
1845             goto utf8Error;
1846
1847         case 2:
1848             if ((s[1] & 0xc0) != 0x80) {
1849                 errmsg = "invalid data";
1850                 startinpos = s-starts;
1851                 endinpos = startinpos+2;
1852                 goto utf8Error;
1853             }
1854             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1855             if (ch < 0x80) {
1856                 startinpos = s-starts;
1857                 endinpos = startinpos+2;
1858                 errmsg = "illegal encoding";
1859                 goto utf8Error;
1860             }
1861             else
1862                 *p++ = (Py_UNICODE)ch;
1863             break;
1864
1865         case 3:
1866             if ((s[1] & 0xc0) != 0x80 ||
1867                 (s[2] & 0xc0) != 0x80) {
1868                 errmsg = "invalid data";
1869                 startinpos = s-starts;
1870                 endinpos = startinpos+3;
1871                 goto utf8Error;
1872             }
1873             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1874             if (ch < 0x0800) {
1875                 /* Note: UTF-8 encodings of surrogates are considered
1876                    legal UTF-8 sequences;
1877
1878                    XXX For wide builds (UCS-4) we should probably try
1879                        to recombine the surrogates into a single code
1880                        unit.
1881                 */
1882                 errmsg = "illegal encoding";
1883                 startinpos = s-starts;
1884                 endinpos = startinpos+3;
1885                 goto utf8Error;
1886             }
1887             else
1888                 *p++ = (Py_UNICODE)ch;
1889             break;
1890
1891         case 4:
1892             if ((s[1] & 0xc0) != 0x80 ||
1893                 (s[2] & 0xc0) != 0x80 ||
1894                 (s[3] & 0xc0) != 0x80) {
1895                 errmsg = "invalid data";
1896                 startinpos = s-starts;
1897                 endinpos = startinpos+4;
1898                 goto utf8Error;
1899             }
1900             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1901                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1902             /* validate and convert to UTF-16 */
1903             if ((ch < 0x10000)        /* minimum value allowed for 4
1904                                          byte encoding */
1905                 || (ch > 0x10ffff))   /* maximum value allowed for
1906                                          UTF-16 */
1907             {
1908                 errmsg = "illegal encoding";
1909                 startinpos = s-starts;
1910                 endinpos = startinpos+4;
1911                 goto utf8Error;
1912             }
1913 #ifdef Py_UNICODE_WIDE
1914             *p++ = (Py_UNICODE)ch;
1915 #else
1916             /*  compute and append the two surrogates: */
1917
1918             /*  translate from 10000..10FFFF to 0..FFFF */
1919             ch -= 0x10000;
1920
1921             /*  high surrogate = top 10 bits added to D800 */
1922             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1923
1924             /*  low surrogate = bottom 10 bits added to DC00 */
1925             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1926 #endif
1927             break;
1928
1929         default:
1930             /* Other sizes are only needed for UCS-4 */
1931             errmsg = "unsupported Unicode code range";
1932             startinpos = s-starts;
1933             endinpos = startinpos+n;
1934             goto utf8Error;
1935         }
1936         s += n;
1937         continue;
1938
1939     utf8Error:
1940     outpos = p-PyUnicode_AS_UNICODE(unicode);
1941     if (unicode_decode_call_errorhandler(
1942              errors, &errorHandler,
1943              "utf8", errmsg,
1944              starts, size, &startinpos, &endinpos, &exc, &s,
1945              (PyObject **)&unicode, &outpos, &p))
1946         goto onError;
1947     }
1948     if (consumed)
1949         *consumed = s-starts;
1950
1951     /* Adjust length */
1952     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1953         goto onError;
1954
1955     Py_XDECREF(errorHandler);
1956     Py_XDECREF(exc);
1957     return (PyObject *)unicode;
1958
1959 onError:
1960     Py_XDECREF(errorHandler);
1961     Py_XDECREF(exc);
1962     Py_DECREF(unicode);
1963     return NULL;
1964 }
1965
1966 /* Allocation strategy:  if the string is short, convert into a stack buffer
1967    and allocate exactly as much space needed at the end.  Else allocate the
1968    maximum possible needed (4 result bytes per Unicode character), and return
1969    the excess memory at the end.
1970 */
1971 PyObject *
1972 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1973                      Py_ssize_t size,
1974                      const char *errors)
1975 {
1976 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1977
1978     Py_ssize_t i;           /* index into s of next input byte */
1979     PyObject *v;        /* result string object */
1980     char *p;            /* next free byte in output buffer */
1981     Py_ssize_t nallocated;  /* number of result bytes allocated */
1982     Py_ssize_t nneeded;        /* number of result bytes needed */
1983     char stackbuf[MAX_SHORT_UNICHARS * 4];
1984
1985     assert(s != NULL);
1986     assert(size >= 0);
1987
1988     if (size <= MAX_SHORT_UNICHARS) {
1989         /* Write into the stack buffer; nallocated can't overflow.
1990          * At the end, we'll allocate exactly as much heap space as it
1991          * turns out we need.
1992          */
1993         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1994         v = NULL;   /* will allocate after we're done */
1995         p = stackbuf;
1996     }
1997     else {
1998         /* Overallocate on the heap, and give the excess back at the end. */
1999         nallocated = size * 4;
2000         if (nallocated / 4 != size)  /* overflow! */
2001             return PyErr_NoMemory();
2002         v = PyString_FromStringAndSize(NULL, nallocated);
2003         if (v == NULL)
2004             return NULL;
2005         p = PyString_AS_STRING(v);
2006     }
2007
2008     for (i = 0; i < size;) {
2009         Py_UCS4 ch = s[i++];
2010
2011         if (ch < 0x80)
2012             /* Encode ASCII */
2013             *p++ = (char) ch;
2014
2015         else if (ch < 0x0800) {
2016             /* Encode Latin-1 */
2017             *p++ = (char)(0xc0 | (ch >> 6));
2018             *p++ = (char)(0x80 | (ch & 0x3f));
2019         }
2020         else {
2021             /* Encode UCS2 Unicode ordinals */
2022             if (ch < 0x10000) {
2023                 /* Special case: check for high surrogate */
2024                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2025                     Py_UCS4 ch2 = s[i];
2026                     /* Check for low surrogate and combine the two to
2027                        form a UCS4 value */
2028                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2029                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2030                         i++;
2031                         goto encodeUCS4;
2032                     }
2033                     /* Fall through: handles isolated high surrogates */
2034                 }
2035                 *p++ = (char)(0xe0 | (ch >> 12));
2036                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2037                 *p++ = (char)(0x80 | (ch & 0x3f));
2038                 continue;
2039             }
2040 encodeUCS4:
2041             /* Encode UCS4 Unicode ordinals */
2042             *p++ = (char)(0xf0 | (ch >> 18));
2043             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2044             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2045             *p++ = (char)(0x80 | (ch & 0x3f));
2046         }
2047     }
2048
2049     if (v == NULL) {
2050         /* This was stack allocated. */
2051         nneeded = p - stackbuf;
2052         assert(nneeded <= nallocated);
2053         v = PyString_FromStringAndSize(stackbuf, nneeded);
2054     }
2055     else {
2056         /* Cut back to size actually needed. */
2057         nneeded = p - PyString_AS_STRING(v);
2058         assert(nneeded <= nallocated);
2059         _PyString_Resize(&v, nneeded);
2060     }
2061     return v;
2062
2063 #undef MAX_SHORT_UNICHARS
2064 }
2065
2066 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2067 {
2068     if (!PyUnicode_Check(unicode)) {
2069         PyErr_BadArgument();
2070         return NULL;
2071     }
2072     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2073                                 PyUnicode_GET_SIZE(unicode),
2074                                 NULL);
2075 }
2076
2077 /* --- UTF-32 Codec ------------------------------------------------------- */
2078
2079 PyObject *
2080 PyUnicode_DecodeUTF32(const char *s,
2081                       Py_ssize_t size,
2082                       const char *errors,
2083                       int *byteorder)
2084 {
2085     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2086 }
2087
2088 PyObject *
2089 PyUnicode_DecodeUTF32Stateful(const char *s,
2090                               Py_ssize_t size,
2091                               const char *errors,
2092                               int *byteorder,
2093                               Py_ssize_t *consumed)
2094 {
2095     const char *starts = s;
2096     Py_ssize_t startinpos;
2097     Py_ssize_t endinpos;
2098     Py_ssize_t outpos;
2099     PyUnicodeObject *unicode;
2100     Py_UNICODE *p;
2101 #ifndef Py_UNICODE_WIDE
2102     int i, pairs;
2103 #else
2104     const int pairs = 0;
2105 #endif
2106     const unsigned char *q, *e;
2107     int bo = 0;       /* assume native ordering by default */
2108     const char *errmsg = "";
2109     /* Offsets from q for retrieving bytes in the right order. */
2110 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111     int iorder[] = {0, 1, 2, 3};
2112 #else
2113     int iorder[] = {3, 2, 1, 0};
2114 #endif
2115     PyObject *errorHandler = NULL;
2116     PyObject *exc = NULL;
2117     /* On narrow builds we split characters outside the BMP into two
2118        codepoints => count how much extra space we need. */
2119 #ifndef Py_UNICODE_WIDE
2120     for (i = pairs = 0; i < size/4; i++)
2121         if (((Py_UCS4 *)s)[i] >= 0x10000)
2122             pairs++;
2123 #endif
2124
2125     /* This might be one to much, because of a BOM */
2126     unicode = _PyUnicode_New((size+3)/4+pairs);
2127     if (!unicode)
2128         return NULL;
2129     if (size == 0)
2130         return (PyObject *)unicode;
2131
2132     /* Unpack UTF-32 encoded data */
2133     p = unicode->str;
2134     q = (unsigned char *)s;
2135     e = q + size;
2136
2137     if (byteorder)
2138         bo = *byteorder;
2139
2140     /* Check for BOM marks (U+FEFF) in the input and adjust current
2141        byte order setting accordingly. In native mode, the leading BOM
2142        mark is skipped, in all other modes, it is copied to the output
2143        stream as-is (giving a ZWNBSP character). */
2144     if (bo == 0) {
2145         if (size >= 4) {
2146             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2147                                 (q[iorder[1]] << 8) | q[iorder[0]];
2148 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2149             if (bom == 0x0000FEFF) {
2150                 q += 4;
2151                 bo = -1;
2152             }
2153             else if (bom == 0xFFFE0000) {
2154                 q += 4;
2155                 bo = 1;
2156             }
2157 #else
2158             if (bom == 0x0000FEFF) {
2159                 q += 4;
2160                 bo = 1;
2161             }
2162             else if (bom == 0xFFFE0000) {
2163                 q += 4;
2164                 bo = -1;
2165             }
2166 #endif
2167         }
2168     }
2169
2170     if (bo == -1) {
2171         /* force LE */
2172         iorder[0] = 0;
2173         iorder[1] = 1;
2174         iorder[2] = 2;
2175         iorder[3] = 3;
2176     }
2177     else if (bo == 1) {
2178         /* force BE */
2179         iorder[0] = 3;
2180         iorder[1] = 2;
2181         iorder[2] = 1;
2182         iorder[3] = 0;
2183     }
2184
2185     while (q < e) {
2186         Py_UCS4 ch;
2187         /* remaining bytes at the end? (size should be divisible by 4) */
2188         if (e-q<4) {
2189             if (consumed)
2190                 break;
2191             errmsg = "truncated data";
2192             startinpos = ((const char *)q)-starts;
2193             endinpos = ((const char *)e)-starts;
2194             goto utf32Error;
2195             /* The remaining input chars are ignored if the callback
2196                chooses to skip the input */
2197         }
2198         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2199              (q[iorder[1]] << 8) | q[iorder[0]];
2200
2201         if (ch >= 0x110000)
2202         {
2203             errmsg = "codepoint not in range(0x110000)";
2204             startinpos = ((const char *)q)-starts;
2205             endinpos = startinpos+4;
2206             goto utf32Error;
2207         }
2208 #ifndef Py_UNICODE_WIDE
2209         if (ch >= 0x10000)
2210         {
2211             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2212             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2213         }
2214         else
2215 #endif
2216             *p++ = ch;
2217         q += 4;
2218         continue;
2219     utf32Error:
2220         outpos = p-PyUnicode_AS_UNICODE(unicode);
2221     if (unicode_decode_call_errorhandler(
2222          errors, &errorHandler,
2223          "utf32", errmsg,
2224          starts, size, &startinpos, &endinpos, &exc, &s,
2225          (PyObject **)&unicode, &outpos, &p))
2226             goto onError;
2227     }
2228
2229     if (byteorder)
2230         *byteorder = bo;
2231
2232     if (consumed)
2233         *consumed = (const char *)q-starts;
2234
2235     /* Adjust length */
2236     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2237         goto onError;
2238
2239     Py_XDECREF(errorHandler);
2240     Py_XDECREF(exc);
2241     return (PyObject *)unicode;
2242
2243 onError:
2244     Py_DECREF(unicode);
2245     Py_XDECREF(errorHandler);
2246     Py_XDECREF(exc);
2247     return NULL;
2248 }
2249
2250 PyObject *
2251 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2252                       Py_ssize_t size,
2253                       const char *errors,
2254                       int byteorder)
2255 {
2256     PyObject *v;
2257     unsigned char *p;
2258     Py_ssize_t nsize, bytesize;
2259 #ifndef Py_UNICODE_WIDE
2260     Py_ssize_t i, pairs;
2261 #else
2262     const int pairs = 0;
2263 #endif
2264     /* Offsets from p for storing byte pairs in the right order. */
2265 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2266     int iorder[] = {0, 1, 2, 3};
2267 #else
2268     int iorder[] = {3, 2, 1, 0};
2269 #endif
2270
2271 #define STORECHAR(CH)                       \
2272     do {                                    \
2273         p[iorder[3]] = ((CH) >> 24) & 0xff; \
2274         p[iorder[2]] = ((CH) >> 16) & 0xff; \
2275         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2276         p[iorder[0]] = (CH) & 0xff;         \
2277         p += 4;                             \
2278     } while(0)
2279
2280     /* In narrow builds we can output surrogate pairs as one codepoint,
2281        so we need less space. */
2282 #ifndef Py_UNICODE_WIDE
2283     for (i = pairs = 0; i < size-1; i++)
2284         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2285             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2286             pairs++;
2287 #endif
2288     nsize = (size - pairs + (byteorder == 0));
2289     bytesize = nsize * 4;
2290     if (bytesize / 4 != nsize)
2291         return PyErr_NoMemory();
2292     v = PyString_FromStringAndSize(NULL, bytesize);
2293     if (v == NULL)
2294         return NULL;
2295
2296     p = (unsigned char *)PyString_AS_STRING(v);
2297     if (byteorder == 0)
2298         STORECHAR(0xFEFF);
2299     if (size == 0)
2300         return v;
2301
2302     if (byteorder == -1) {
2303         /* force LE */
2304         iorder[0] = 0;
2305         iorder[1] = 1;
2306         iorder[2] = 2;
2307         iorder[3] = 3;
2308     }
2309     else if (byteorder == 1) {
2310         /* force BE */
2311         iorder[0] = 3;
2312         iorder[1] = 2;
2313         iorder[2] = 1;
2314         iorder[3] = 0;
2315     }
2316
2317     while (size-- > 0) {
2318         Py_UCS4 ch = *s++;
2319 #ifndef Py_UNICODE_WIDE
2320         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2321             Py_UCS4 ch2 = *s;
2322             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2323                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2324                 s++;
2325                 size--;
2326             }
2327         }
2328 #endif
2329         STORECHAR(ch);
2330     }
2331     return v;
2332 #undef STORECHAR
2333 }
2334
2335 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2336 {
2337     if (!PyUnicode_Check(unicode)) {
2338         PyErr_BadArgument();
2339         return NULL;
2340     }
2341     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2342                                  PyUnicode_GET_SIZE(unicode),
2343                                  NULL,
2344                                  0);
2345 }
2346
2347 /* --- UTF-16 Codec ------------------------------------------------------- */
2348
2349 PyObject *
2350 PyUnicode_DecodeUTF16(const char *s,
2351                       Py_ssize_t size,
2352                       const char *errors,
2353                       int *byteorder)
2354 {
2355     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2356 }
2357
2358 PyObject *
2359 PyUnicode_DecodeUTF16Stateful(const char *s,
2360                               Py_ssize_t size,
2361                               const char *errors,
2362                               int *byteorder,
2363                               Py_ssize_t *consumed)
2364 {
2365     const char *starts = s;
2366     Py_ssize_t startinpos;
2367     Py_ssize_t endinpos;
2368     Py_ssize_t outpos;
2369     PyUnicodeObject *unicode;
2370     Py_UNICODE *p;
2371     const unsigned char *q, *e;
2372     int bo = 0;       /* assume native ordering by default */
2373     const char *errmsg = "";
2374     /* Offsets from q for retrieving byte pairs in the right order. */
2375 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2376     int ihi = 1, ilo = 0;
2377 #else
2378     int ihi = 0, ilo = 1;
2379 #endif
2380     PyObject *errorHandler = NULL;
2381     PyObject *exc = NULL;
2382
2383     /* Note: size will always be longer than the resulting Unicode
2384        character count */
2385     unicode = _PyUnicode_New(size);
2386     if (!unicode)
2387         return NULL;
2388     if (size == 0)
2389         return (PyObject *)unicode;
2390
2391     /* Unpack UTF-16 encoded data */
2392     p = unicode->str;
2393     q = (unsigned char *)s;
2394     e = q + size;
2395
2396     if (byteorder)
2397         bo = *byteorder;
2398
2399     /* Check for BOM marks (U+FEFF) in the input and adjust current
2400        byte order setting accordingly. In native mode, the leading BOM
2401        mark is skipped, in all other modes, it is copied to the output
2402        stream as-is (giving a ZWNBSP character). */
2403     if (bo == 0) {
2404         if (size >= 2) {
2405             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2406 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2407             if (bom == 0xFEFF) {
2408                 q += 2;
2409                 bo = -1;
2410             }
2411             else if (bom == 0xFFFE) {
2412                 q += 2;
2413                 bo = 1;
2414             }
2415 #else
2416             if (bom == 0xFEFF) {
2417                 q += 2;
2418                 bo = 1;
2419             }
2420             else if (bom == 0xFFFE) {
2421                 q += 2;
2422                 bo = -1;
2423             }
2424 #endif
2425         }
2426     }
2427
2428     if (bo == -1) {
2429         /* force LE */
2430         ihi = 1;
2431         ilo = 0;
2432     }
2433     else if (bo == 1) {
2434         /* force BE */
2435         ihi = 0;
2436         ilo = 1;
2437     }
2438
2439     while (q < e) {
2440         Py_UNICODE ch;
2441         /* remaining bytes at the end? (size should be even) */
2442         if (e-q<2) {
2443             if (consumed)
2444                 break;
2445             errmsg = "truncated data";
2446             startinpos = ((const char *)q)-starts;
2447             endinpos = ((const char *)e)-starts;
2448             goto utf16Error;
2449             /* The remaining input chars are ignored if the callback
2450                chooses to skip the input */
2451         }
2452         ch = (q[ihi] << 8) | q[ilo];
2453
2454         q += 2;
2455
2456         if (ch < 0xD800 || ch > 0xDFFF) {
2457             *p++ = ch;
2458             continue;
2459         }
2460
2461         /* UTF-16 code pair: */
2462         if (q >= e) {
2463             errmsg = "unexpected end of data";
2464             startinpos = (((const char *)q)-2)-starts;
2465             endinpos = ((const char *)e)-starts;
2466             goto utf16Error;
2467         }
2468         if (0xD800 <= ch && ch <= 0xDBFF) {
2469             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2470             q += 2;
2471             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2472 #ifndef Py_UNICODE_WIDE
2473                 *p++ = ch;
2474                 *p++ = ch2;
2475 #else
2476                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2477 #endif
2478                 continue;
2479             }
2480             else {
2481                 errmsg = "illegal UTF-16 surrogate";
2482                 startinpos = (((const char *)q)-4)-starts;
2483                 endinpos = startinpos+2;
2484                 goto utf16Error;
2485             }
2486
2487         }
2488         errmsg = "illegal encoding";
2489         startinpos = (((const char *)q)-2)-starts;
2490         endinpos = startinpos+2;
2491         /* Fall through to report the error */
2492
2493     utf16Error:
2494         outpos = p-PyUnicode_AS_UNICODE(unicode);
2495         if (unicode_decode_call_errorhandler(
2496                  errors, &errorHandler,
2497                  "utf16", errmsg,
2498                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2499                  (PyObject **)&unicode, &outpos, &p))
2500             goto onError;
2501     }
2502
2503     if (byteorder)
2504         *byteorder = bo;
2505
2506     if (consumed)
2507         *consumed = (const char *)q-starts;
2508
2509     /* Adjust length */
2510     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2511         goto onError;
2512
2513     Py_XDECREF(errorHandler);
2514     Py_XDECREF(exc);
2515     return (PyObject *)unicode;
2516
2517 onError:
2518     Py_DECREF(unicode);
2519     Py_XDECREF(errorHandler);
2520     Py_XDECREF(exc);
2521     return NULL;
2522 }
2523
2524 PyObject *
2525 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2526                       Py_ssize_t size,
2527                       const char *errors,
2528                       int byteorder)
2529 {
2530     PyObject *v;
2531     unsigned char *p;
2532     Py_ssize_t nsize, bytesize;
2533 #ifdef Py_UNICODE_WIDE
2534     Py_ssize_t i, pairs;
2535 #else
2536     const int pairs = 0;
2537 #endif
2538     /* Offsets from p for storing byte pairs in the right order. */
2539 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2540     int ihi = 1, ilo = 0;
2541 #else
2542     int ihi = 0, ilo = 1;
2543 #endif
2544
2545 #define STORECHAR(CH)                   \
2546     do {                                \
2547         p[ihi] = ((CH) >> 8) & 0xff;    \
2548         p[ilo] = (CH) & 0xff;           \
2549         p += 2;                         \
2550     } while(0)
2551
2552 #ifdef Py_UNICODE_WIDE
2553     for (i = pairs = 0; i < size; i++)
2554         if (s[i] >= 0x10000)
2555             pairs++;
2556 #endif
2557     /* 2 * (size + pairs + (byteorder == 0)) */
2558     if (size > PY_SSIZE_T_MAX ||
2559         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2560         return PyErr_NoMemory();
2561     nsize = size + pairs + (byteorder == 0);
2562     bytesize = nsize * 2;
2563     if (bytesize / 2 != nsize)
2564         return PyErr_NoMemory();
2565     v = PyString_FromStringAndSize(NULL, bytesize);
2566     if (v == NULL)
2567         return NULL;
2568
2569     p = (unsigned char *)PyString_AS_STRING(v);
2570     if (byteorder == 0)
2571         STORECHAR(0xFEFF);
2572     if (size == 0)
2573         return v;
2574
2575     if (byteorder == -1) {
2576         /* force LE */
2577         ihi = 1;
2578         ilo = 0;
2579     }
2580     else if (byteorder == 1) {
2581         /* force BE */
2582         ihi = 0;
2583         ilo = 1;
2584     }
2585
2586     while (size-- > 0) {
2587         Py_UNICODE ch = *s++;
2588         Py_UNICODE ch2 = 0;
2589 #ifdef Py_UNICODE_WIDE
2590         if (ch >= 0x10000) {
2591             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2592             ch  = 0xD800 | ((ch-0x10000) >> 10);
2593         }
2594 #endif
2595         STORECHAR(ch);
2596         if (ch2)
2597             STORECHAR(ch2);
2598     }
2599     return v;
2600 #undef STORECHAR
2601 }
2602
2603 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2604 {
2605     if (!PyUnicode_Check(unicode)) {
2606         PyErr_BadArgument();
2607         return NULL;
2608     }
2609     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2610                                  PyUnicode_GET_SIZE(unicode),
2611                                  NULL,
2612                                  0);
2613 }
2614
2615 /* --- Unicode Escape Codec ----------------------------------------------- */
2616
2617 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2618
2619 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2620                                         Py_ssize_t size,
2621                                         const char *errors)
2622 {
2623     const char *starts = s;
2624     Py_ssize_t startinpos;
2625     Py_ssize_t endinpos;
2626     Py_ssize_t outpos;
2627     int i;
2628     PyUnicodeObject *v;
2629     Py_UNICODE *p;
2630     const char *end;
2631     char* message;
2632     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2633     PyObject *errorHandler = NULL;
2634     PyObject *exc = NULL;
2635
2636     /* Escaped strings will always be longer than the resulting
2637        Unicode string, so we start with size here and then reduce the
2638        length after conversion to the true value.
2639        (but if the error callback returns a long replacement string
2640        we'll have to allocate more space) */
2641     v = _PyUnicode_New(size);
2642     if (v == NULL)
2643         goto onError;
2644     if (size == 0)
2645         return (PyObject *)v;
2646
2647     p = PyUnicode_AS_UNICODE(v);
2648     end = s + size;
2649
2650     while (s < end) {
2651         unsigned char c;
2652         Py_UNICODE x;
2653         int digits;
2654
2655         /* Non-escape characters are interpreted as Unicode ordinals */
2656         if (*s != '\\') {
2657             *p++ = (unsigned char) *s++;
2658             continue;
2659         }
2660
2661         startinpos = s-starts;
2662         /* \ - Escapes */
2663         s++;
2664         c = *s++;
2665         if (s > end)
2666             c = '\0'; /* Invalid after \ */
2667         switch (c) {
2668
2669         /* \x escapes */
2670         case '\n': break;
2671         case '\\': *p++ = '\\'; break;
2672         case '\'': *p++ = '\''; break;
2673         case '\"': *p++ = '\"'; break;
2674         case 'b': *p++ = '\b'; break;
2675         case 'f': *p++ = '\014'; break; /* FF */
2676         case 't': *p++ = '\t'; break;
2677         case 'n': *p++ = '\n'; break;
2678         case 'r': *p++ = '\r'; break;
2679         case 'v': *p++ = '\013'; break; /* VT */
2680         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2681
2682         /* \OOO (octal) escapes */
2683         case '0': case '1': case '2': case '3':
2684         case '4': case '5': case '6': case '7':
2685             x = s[-1] - '0';
2686             if (s < end && '0' <= *s && *s <= '7') {
2687                 x = (x<<3) + *s++ - '0';
2688                 if (s < end && '0' <= *s && *s <= '7')
2689                     x = (x<<3) + *s++ - '0';
2690             }
2691             *p++ = x;
2692             break;
2693
2694         /* hex escapes */
2695         /* \xXX */
2696         case 'x':
2697             digits = 2;
2698             message = "truncated \\xXX escape";
2699             goto hexescape;
2700
2701         /* \uXXXX */
2702         case 'u':
2703             digits = 4;
2704             message = "truncated \\uXXXX escape";
2705             goto hexescape;
2706
2707         /* \UXXXXXXXX */
2708         case 'U':
2709             digits = 8;
2710             message = "truncated \\UXXXXXXXX escape";
2711         hexescape:
2712             chr = 0;
2713             outpos = p-PyUnicode_AS_UNICODE(v);
2714             if (s+digits>end) {
2715                 endinpos = size;
2716                 if (unicode_decode_call_errorhandler(
2717                     errors, &errorHandler,
2718                     "unicodeescape", "end of string in escape sequence",
2719                     starts, size, &startinpos, &endinpos, &exc, &s,
2720                     (PyObject **)&v, &outpos, &p))
2721                     goto onError;
2722                 goto nextByte;
2723             }
2724             for (i = 0; i < digits; ++i) {
2725                 c = (unsigned char) s[i];
2726                 if (!isxdigit(c)) {
2727                     endinpos = (s+i+1)-starts;
2728                     if (unicode_decode_call_errorhandler(
2729                         errors, &errorHandler,
2730                         "unicodeescape", message,
2731                         starts, size, &startinpos, &endinpos, &exc, &s,
2732                         (PyObject **)&v, &outpos, &p))
2733                         goto onError;
2734                     goto nextByte;
2735                 }
2736                 chr = (chr<<4) & ~0xF;
2737                 if (c >= '0' && c <= '9')
2738                     chr += c - '0';
2739                 else if (c >= 'a' && c <= 'f')
2740                     chr += 10 + c - 'a';
2741                 else
2742                     chr += 10 + c - 'A';
2743             }
2744             s += i;
2745             if (chr == 0xffffffff && PyErr_Occurred())
2746                 /* _decoding_error will have already written into the
2747                    target buffer. */
2748                 break;
2749         store:
2750             /* when we get here, chr is a 32-bit unicode character */
2751             if (chr <= 0xffff)
2752                 /* UCS-2 character */
2753                 *p++ = (Py_UNICODE) chr;
2754             else if (chr <= 0x10ffff) {
2755                 /* UCS-4 character. Either store directly, or as
2756                    surrogate pair. */
2757 #ifdef Py_UNICODE_WIDE
2758                 *p++ = chr;
2759 #else
2760                 chr -= 0x10000L;
2761                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2762                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2763 #endif
2764             } else {
2765                 endinpos = s-starts;
2766                 outpos = p-PyUnicode_AS_UNICODE(v);
2767                 if (unicode_decode_call_errorhandler(
2768                     errors, &errorHandler,
2769                     "unicodeescape", "illegal Unicode character",
2770                     starts, size, &startinpos, &endinpos, &exc, &s,
2771                     (PyObject **)&v, &outpos, &p))
2772                     goto onError;
2773             }
2774             break;
2775
2776         /* \N{name} */
2777         case 'N':
2778             message = "malformed \\N character escape";
2779             if (ucnhash_CAPI == NULL) {
2780                 /* load the unicode data module */
2781                 PyObject *m, *api;
2782                 m = PyImport_ImportModuleNoBlock("unicodedata");
2783                 if (m == NULL)
2784                     goto ucnhashError;
2785                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2786                 Py_DECREF(m);
2787                 if (api == NULL)
2788                     goto ucnhashError;
2789                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2790                 Py_DECREF(api);
2791                 if (ucnhash_CAPI == NULL)
2792                     goto ucnhashError;
2793             }
2794             if (*s == '{') {
2795                 const char *start = s+1;
2796                 /* look for the closing brace */
2797                 while (*s != '}' && s < end)
2798                     s++;
2799                 if (s > start && s < end && *s == '}') {
2800                     /* found a name.  look it up in the unicode database */
2801                     message = "unknown Unicode character name";
2802                     s++;
2803                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2804                         goto store;
2805                 }
2806             }
2807             endinpos = s-starts;
2808             outpos = p-PyUnicode_AS_UNICODE(v);
2809             if (unicode_decode_call_errorhandler(
2810                 errors, &errorHandler,
2811                 "unicodeescape", message,
2812                 starts, size, &startinpos, &endinpos, &exc, &s,
2813                 (PyObject **)&v, &outpos, &p))
2814                 goto onError;
2815             break;
2816
2817         default:
2818             if (s > end) {
2819                 message = "\\ at end of string";
2820                 s--;
2821                 endinpos = s-starts;
2822                 outpos = p-PyUnicode_AS_UNICODE(v);
2823                 if (unicode_decode_call_errorhandler(
2824                     errors, &errorHandler,
2825                     "unicodeescape", message,
2826                     starts, size, &startinpos, &endinpos, &exc, &s,
2827                     (PyObject **)&v, &outpos, &p))
2828                     goto onError;
2829             }
2830             else {
2831                 *p++ = '\\';
2832                 *p++ = (unsigned char)s[-1];
2833             }
2834             break;
2835         }
2836         nextByte:
2837         ;
2838     }
2839     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2840         goto onError;
2841     Py_XDECREF(errorHandler);
2842     Py_XDECREF(exc);
2843     return (PyObject *)v;
2844
2845 ucnhashError:
2846     PyErr_SetString(
2847         PyExc_UnicodeError,
2848         "\\N escapes not supported (can't load unicodedata module)"
2849         );
2850     Py_XDECREF(v);
2851     Py_XDECREF(errorHandler);
2852     Py_XDECREF(exc);
2853     return NULL;
2854
2855 onError:
2856     Py_XDECREF(v);
2857     Py_XDECREF(errorHandler);
2858     Py_XDECREF(exc);
2859     return NULL;
2860 }
2861
2862 /* Return a Unicode-Escape string version of the Unicode object.
2863
2864    If quotes is true, the string is enclosed in u"" or u'' quotes as
2865    appropriate.
2866
2867 */
2868
2869 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2870                                       Py_ssize_t size,
2871                                       Py_UNICODE ch)
2872 {
2873     /* like wcschr, but doesn't stop at NULL characters */
2874
2875     while (size-- > 0) {
2876         if (*s == ch)
2877             return s;
2878         s++;
2879     }
2880
2881     return NULL;
2882 }
2883
2884 static
2885 PyObject *unicodeescape_string(const Py_UNICODE *s,
2886                                Py_ssize_t size,
2887                                int quotes)
2888 {
2889     PyObject *repr;
2890     char *p;
2891
2892     static const char *hexdigit = "0123456789abcdef";
2893 #ifdef Py_UNICODE_WIDE
2894     const Py_ssize_t expandsize = 10;
2895 #else
2896     const Py_ssize_t expandsize = 6;
2897 #endif
2898
2899     /* XXX(nnorwitz): rather than over-allocating, it would be
2900        better to choose a different scheme.  Perhaps scan the
2901        first N-chars of the string and allocate based on that size.
2902     */
2903     /* Initial allocation is based on the longest-possible unichr
2904        escape.
2905
2906        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2907        unichr, so in this case it's the longest unichr escape. In
2908        narrow (UTF-16) builds this is five chars per source unichr
2909        since there are two unichrs in the surrogate pair, so in narrow
2910        (UTF-16) builds it's not the longest unichr escape.
2911
2912        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2913        so in the narrow (UTF-16) build case it's the longest unichr
2914        escape.
2915     */
2916
2917     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
2918         return PyErr_NoMemory();
2919
2920     repr = PyString_FromStringAndSize(NULL,
2921         2
2922         + expandsize*size
2923         + 1);
2924     if (repr == NULL)
2925         return NULL;
2926
2927     p = PyString_AS_STRING(repr);
2928
2929     if (quotes) {
2930         *p++ = 'u';
2931         *p++ = (findchar(s, size, '\'') &&
2932                 !findchar(s, size, '"')) ? '"' : '\'';
2933     }
2934     while (size-- > 0) {
2935         Py_UNICODE ch = *s++;
2936
2937         /* Escape quotes and backslashes */
2938         if ((quotes &&
2939              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2940             *p++ = '\\';
2941             *p++ = (char) ch;
2942             continue;
2943         }
2944
2945 #ifdef Py_UNICODE_WIDE
2946         /* Map 21-bit characters to '\U00xxxxxx' */
2947         else if (ch >= 0x10000) {
2948             *p++ = '\\';
2949             *p++ = 'U';
2950             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2951             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2952             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2953             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2954             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2955             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2956             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2957             *p++ = hexdigit[ch & 0x0000000F];
2958             continue;
2959         }
2960 #else
2961         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2962         else if (ch >= 0xD800 && ch < 0xDC00) {
2963             Py_UNICODE ch2;
2964             Py_UCS4 ucs;
2965
2966             ch2 = *s++;
2967             size--;
2968             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2969                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2970                 *p++ = '\\';
2971                 *p++ = 'U';
2972                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2973                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2974                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2975                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2976                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2977                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2978                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2979                 *p++ = hexdigit[ucs & 0x0000000F];
2980                 continue;
2981             }
2982             /* Fall through: isolated surrogates are copied as-is */
2983             s--;
2984             size++;
2985         }
2986 #endif
2987
2988         /* Map 16-bit characters to '\uxxxx' */
2989         if (ch >= 256) {
2990             *p++ = '\\';
2991             *p++ = 'u';
2992             *p++ = hexdigit[(ch >> 12) & 0x000F];
2993             *p++ = hexdigit[(ch >> 8) & 0x000F];
2994             *p++ = hexdigit[(ch >> 4) & 0x000F];
2995             *p++ = hexdigit[ch & 0x000F];
2996         }
2997
2998         /* Map special whitespace to '\t', \n', '\r' */
2999         else if (ch == '\t') {
3000             *p++ = '\\';
3001             *p++ = 't';
3002         }
3003         else if (ch == '\n') {
3004             *p++ = '\\';
3005             *p++ = 'n';
3006         }
3007         else if (ch == '\r') {
3008             *p++ = '\\';
3009             *p++ = 'r';
3010         }
3011
3012         /* Map non-printable US ASCII to '\xhh' */
3013         else if (ch < ' ' || ch >= 0x7F) {
3014             *p++ = '\\';
3015             *p++ = 'x';
3016             *p++ = hexdigit[(ch >> 4) & 0x000F];
3017             *p++ = hexdigit[ch & 0x000F];
3018         }
3019
3020         /* Copy everything else as-is */
3021         else
3022             *p++ = (char) ch;
3023     }
3024     if (quotes)
3025         *p++ = PyString_AS_STRING(repr)[1];
3026
3027     *p = '\0';
3028     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
3029     return repr;
3030 }
3031
3032 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3033                                         Py_ssize_t size)
3034 {
3035     return unicodeescape_string(s, size, 0);
3036 }
3037
3038 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3039 {
3040     if (!PyUnicode_Check(unicode)) {
3041         PyErr_BadArgument();
3042         return NULL;
3043     }
3044     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3045                                          PyUnicode_GET_SIZE(unicode));
3046 }
3047
3048 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3049
3050 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3051                                            Py_ssize_t size,
3052                                            const char *errors)
3053 {
3054     const char *starts = s;
3055     Py_ssize_t startinpos;
3056     Py_ssize_t endinpos;
3057     Py_ssize_t outpos;
3058     PyUnicodeObject *v;
3059     Py_UNICODE *p;
3060     const char *end;
3061     const char *bs;
3062     PyObject *errorHandler = NULL;
3063     PyObject *exc = NULL;
3064
3065     /* Escaped strings will always be longer than the resulting
3066        Unicode string, so we start with size here and then reduce the
3067        length after conversion to the true value. (But decoding error
3068        handler might have to resize the string) */
3069     v = _PyUnicode_New(size);
3070     if (v == NULL)
3071         goto onError;
3072     if (size == 0)
3073         return (PyObject *)v;
3074     p = PyUnicode_AS_UNICODE(v);
3075     end = s + size;
3076     while (s < end) {
3077         unsigned char c;
3078         Py_UCS4 x;
3079         int i;
3080         int count;
3081
3082         /* Non-escape characters are interpreted as Unicode ordinals */
3083         if (*s != '\\') {
3084             *p++ = (unsigned char)*s++;
3085             continue;
3086         }
3087         startinpos = s-starts;
3088
3089         /* \u-escapes are only interpreted iff the number of leading
3090            backslashes if odd */
3091         bs = s;
3092         for (;s < end;) {
3093             if (*s != '\\')
3094                 break;
3095             *p++ = (unsigned char)*s++;
3096         }
3097         if (((s - bs) & 1) == 0 ||
3098             s >= end ||
3099             (*s != 'u' && *s != 'U')) {
3100             continue;
3101         }
3102         p--;
3103         count = *s=='u' ? 4 : 8;
3104         s++;
3105
3106         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3107         outpos = p-PyUnicode_AS_UNICODE(v);
3108         for (x = 0, i = 0; i < count; ++i, ++s) {
3109             c = (unsigned char)*s;
3110             if (!isxdigit(c)) {
3111                 endinpos = s-starts;
3112                 if (unicode_decode_call_errorhandler(
3113                     errors, &errorHandler,
3114                     "rawunicodeescape", "truncated \\uXXXX",
3115                     starts, size, &startinpos, &endinpos, &exc, &s,
3116                     (PyObject **)&v, &outpos, &p))
3117                     goto onError;
3118                 goto nextByte;
3119             }
3120             x = (x<<4) & ~0xF;
3121             if (c >= '0' && c <= '9')
3122                 x += c - '0';
3123             else if (c >= 'a' && c <= 'f')
3124                 x += 10 + c - 'a';
3125             else
3126                 x += 10 + c - 'A';
3127         }
3128         if (x <= 0xffff)
3129                 /* UCS-2 character */
3130                 *p++ = (Py_UNICODE) x;
3131         else if (x <= 0x10ffff) {
3132                 /* UCS-4 character. Either store directly, or as
3133                    surrogate pair. */
3134 #ifdef Py_UNICODE_WIDE
3135                 *p++ = (Py_UNICODE) x;
3136 #else
3137                 x -= 0x10000L;
3138                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3139                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3140 #endif
3141         } else {
3142             endinpos = s-starts;
3143             outpos = p-PyUnicode_AS_UNICODE(v);
3144             if (unicode_decode_call_errorhandler(
3145                     errors, &errorHandler,
3146                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3147                     starts, size, &startinpos, &endinpos, &exc, &s,
3148                     (PyObject **)&v, &outpos, &p))
3149                     goto onError;
3150         }
3151         nextByte:
3152         ;
3153     }
3154     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3155         goto onError;
3156     Py_XDECREF(errorHandler);
3157     Py_XDECREF(exc);
3158     return (PyObject *)v;
3159
3160  onError:
3161     Py_XDECREF(v);
3162     Py_XDECREF(errorHandler);
3163     Py_XDECREF(exc);
3164     return NULL;
3165 }
3166
3167 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3168                                            Py_ssize_t size)
3169 {
3170     PyObject *repr;
3171     char *p;
3172     char *q;
3173
3174     static const char *hexdigit = "0123456789abcdef";
3175 #ifdef Py_UNICODE_WIDE
3176     const Py_ssize_t expandsize = 10;
3177 #else
3178     const Py_ssize_t expandsize = 6;
3179 #endif
3180
3181     if (size > PY_SSIZE_T_MAX / expandsize)
3182         return PyErr_NoMemory();
3183
3184     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3185     if (repr == NULL)
3186         return NULL;
3187     if (size == 0)
3188         return repr;
3189
3190     p = q = PyString_AS_STRING(repr);
3191     while (size-- > 0) {
3192         Py_UNICODE ch = *s++;
3193 #ifdef Py_UNICODE_WIDE
3194         /* Map 32-bit characters to '\Uxxxxxxxx' */
3195         if (ch >= 0x10000) {
3196             *p++ = '\\';
3197             *p++ = 'U';
3198             *p++ = hexdigit[(ch >> 28) & 0xf];
3199             *p++ = hexdigit[(ch >> 24) & 0xf];
3200             *p++ = hexdigit[(ch >> 20) & 0xf];
3201             *p++ = hexdigit[(ch >> 16) & 0xf];
3202             *p++ = hexdigit[(ch >> 12) & 0xf];
3203             *p++ = hexdigit[(ch >> 8) & 0xf];
3204             *p++ = hexdigit[(ch >> 4) & 0xf];
3205             *p++ = hexdigit[ch & 15];
3206         }
3207         else
3208 #else
3209         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3210         if (ch >= 0xD800 && ch < 0xDC00) {
3211             Py_UNICODE ch2;
3212             Py_UCS4 ucs;
3213
3214             ch2 = *s++;
3215             size--;
3216             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3217                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3218                 *p++ = '\\';
3219                 *p++ = 'U';
3220                 *p++ = hexdigit[(ucs >> 28) & 0xf];
3221                 *p++ = hexdigit[(ucs >> 24) & 0xf];
3222                 *p++ = hexdigit[(ucs >> 20) & 0xf];
3223                 *p++ = hexdigit[(ucs >> 16) & 0xf];
3224                 *p++ = hexdigit[(ucs >> 12) & 0xf];
3225                 *p++ = hexdigit[(ucs >> 8) & 0xf];
3226                 *p++ = hexdigit[(ucs >> 4) & 0xf];
3227                 *p++ = hexdigit[ucs & 0xf];
3228                 continue;
3229             }
3230             /* Fall through: isolated surrogates are copied as-is */
3231             s--;
3232             size++;
3233         }
3234 #endif
3235         /* Map 16-bit characters to '\uxxxx' */
3236         if (ch >= 256) {
3237             *p++ = '\\';
3238             *p++ = 'u';
3239             *p++ = hexdigit[(ch >> 12) & 0xf];
3240             *p++ = hexdigit[(ch >> 8) & 0xf];
3241             *p++ = hexdigit[(ch >> 4) & 0xf];
3242             *p++ = hexdigit[ch & 15];
3243         }
3244         /* Copy everything else as-is */
3245         else
3246             *p++ = (char) ch;
3247     }
3248     *p = '\0';
3249     _PyString_Resize(&repr, p - q);
3250     return repr;
3251 }
3252
3253 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3254 {
3255     if (!PyUnicode_Check(unicode)) {
3256         PyErr_BadArgument();
3257         return NULL;
3258     }
3259     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3260                                             PyUnicode_GET_SIZE(unicode));
3261 }
3262
3263 /* --- Unicode Internal Codec ------------------------------------------- */
3264
3265 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3266                                            Py_ssize_t size,
3267                                            const char *errors)
3268 {
3269     const char *starts = s;
3270     Py_ssize_t startinpos;
3271     Py_ssize_t endinpos;
3272     Py_ssize_t outpos;
3273     PyUnicodeObject *v;
3274     Py_UNICODE *p;
3275     const char *end;
3276     const char *reason;
3277     PyObject *errorHandler = NULL;
3278     PyObject *exc = NULL;
3279
3280 #ifdef Py_UNICODE_WIDE
3281     Py_UNICODE unimax = PyUnicode_GetMax();
3282 #endif
3283
3284     /* XXX overflow detection missing */
3285     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3286     if (v == NULL)
3287         goto onError;
3288     if (PyUnicode_GetSize((PyObject *)v) == 0)
3289         return (PyObject *)v;
3290     p = PyUnicode_AS_UNICODE(v);
3291     end = s + size;
3292
3293     while (s < end) {
3294         memcpy(p, s, sizeof(Py_UNICODE));
3295         /* We have to sanity check the raw data, otherwise doom looms for
3296            some malformed UCS-4 data. */
3297         if (
3298             #ifdef Py_UNICODE_WIDE
3299             *p > unimax || *p < 0 ||
3300             #endif
3301             end-s < Py_UNICODE_SIZE
3302             )
3303             {
3304             startinpos = s - starts;
3305             if (end-s < Py_UNICODE_SIZE) {
3306                 endinpos = end-starts;
3307                 reason = "truncated input";
3308             }
3309             else {
3310                 endinpos = s - starts + Py_UNICODE_SIZE;
3311                 reason = "illegal code point (> 0x10FFFF)";
3312             }
3313             outpos = p - PyUnicode_AS_UNICODE(v);
3314             if (unicode_decode_call_errorhandler(
3315                     errors, &errorHandler,
3316                     "unicode_internal", reason,
3317                     starts, size, &startinpos, &endinpos, &exc, &s,
3318                     (PyObject **)&v, &outpos, &p)) {
3319                 goto onError;
3320             }
3321         }
3322         else {
3323             p++;
3324             s += Py_UNICODE_SIZE;
3325         }
3326     }
3327
3328     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3329         goto onError;
3330     Py_XDECREF(errorHandler);
3331     Py_XDECREF(exc);
3332     return (PyObject *)v;
3333
3334  onError:
3335     Py_XDECREF(v);
3336     Py_XDECREF(errorHandler);
3337     Py_XDECREF(exc);
3338     return NULL;
3339 }
3340
3341 /* --- Latin-1 Codec ------------------------------------------------------ */
3342
3343 PyObject *PyUnicode_DecodeLatin1(const char *s,
3344                                  Py_ssize_t size,
3345                                  const char *errors)
3346 {
3347     PyUnicodeObject *v;
3348     Py_UNICODE *p;
3349
3350     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3351     if (size == 1) {
3352         Py_UNICODE r = *(unsigned char*)s;
3353         return PyUnicode_FromUnicode(&r, 1);
3354     }
3355
3356     v = _PyUnicode_New(size);
3357     if (v == NULL)
3358         goto onError;
3359     if (size == 0)
3360         return (PyObject *)v;
3361     p = PyUnicode_AS_UNICODE(v);
3362     while (size-- > 0)
3363         *p++ = (unsigned char)*s++;
3364     return (PyObject *)v;
3365
3366  onError:
3367     Py_XDECREF(v);
3368     return NULL;
3369 }
3370
3371 /* create or adjust a UnicodeEncodeError */
3372 static void make_encode_exception(PyObject **exceptionObject,
3373     const char *encoding,
3374     const Py_UNICODE *unicode, Py_ssize_t size,
3375     Py_ssize_t startpos, Py_ssize_t endpos,
3376     const char *reason)
3377 {
3378     if (*exceptionObject == NULL) {
3379         *exceptionObject = PyUnicodeEncodeError_Create(
3380             encoding, unicode, size, startpos, endpos, reason);
3381     }
3382     else {
3383         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3384             goto onError;
3385         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3386             goto onError;
3387         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3388             goto onError;
3389         return;
3390         onError:
3391         Py_DECREF(*exceptionObject);
3392         *exceptionObject = NULL;
3393     }
3394 }
3395
3396 /* raises a UnicodeEncodeError */
3397 static void raise_encode_exception(PyObject **exceptionObject,
3398     const char *encoding,
3399     const Py_UNICODE *unicode, Py_ssize_t size,
3400     Py_ssize_t startpos, Py_ssize_t endpos,
3401     const char *reason)
3402 {
3403     make_encode_exception(exceptionObject,
3404         encoding, unicode, size, startpos, endpos, reason);
3405     if (*exceptionObject != NULL)
3406         PyCodec_StrictErrors(*exceptionObject);
3407 }
3408
3409 /* error handling callback helper:
3410    build arguments, call the callback and check the arguments,
3411    put the result into newpos and return the replacement string, which
3412    has to be freed by the caller */
3413 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3414     PyObject **errorHandler,
3415     const char *encoding, const char *reason,
3416     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3417     Py_ssize_t startpos, Py_ssize_t endpos,
3418     Py_ssize_t *newpos)
3419 {
3420     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3421
3422     PyObject *restuple;
3423     PyObject *resunicode;
3424
3425     if (*errorHandler == NULL) {
3426         *errorHandler = PyCodec_LookupError(errors);
3427         if (*errorHandler == NULL)
3428             return NULL;
3429     }
3430
3431     make_encode_exception(exceptionObject,
3432         encoding, unicode, size, startpos, endpos, reason);
3433     if (*exceptionObject == NULL)
3434         return NULL;
3435
3436     restuple = PyObject_CallFunctionObjArgs(
3437         *errorHandler, *exceptionObject, NULL);
3438     if (restuple == NULL)
3439         return NULL;
3440     if (!PyTuple_Check(restuple)) {
3441         PyErr_Format(PyExc_TypeError, &argparse[4]);
3442         Py_DECREF(restuple);
3443         return NULL;
3444     }
3445     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3446         &resunicode, newpos)) {
3447         Py_DECREF(restuple);
3448         return NULL;
3449     }
3450     if (*newpos<0)
3451         *newpos = size+*newpos;
3452     if (*newpos<0 || *newpos>size) {
3453         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3454         Py_DECREF(restuple);
3455         return NULL;
3456     }
3457     Py_INCREF(resunicode);
3458     Py_DECREF(restuple);
3459     return resunicode;
3460 }
3461
3462 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3463                                  Py_ssize_t size,
3464                                  const char *errors,
3465                                  int limit)
3466 {
3467     /* output object */
3468     PyObject *res;
3469     /* pointers to the beginning and end+1 of input */
3470     const Py_UNICODE *startp = p;
3471     const Py_UNICODE *endp = p + size;
3472     /* pointer to the beginning of the unencodable characters */
3473     /* const Py_UNICODE *badp = NULL; */
3474     /* pointer into the output */
3475     char *str;
3476     /* current output position */
3477     Py_ssize_t respos = 0;
3478     Py_ssize_t ressize;
3479     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3480     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3481     PyObject *errorHandler = NULL;
3482     PyObject *exc = NULL;
3483     /* the following variable is used for caching string comparisons
3484      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3485     int known_errorHandler = -1;
3486
3487     /* allocate enough for a simple encoding without
3488        replacements, if we need more, we'll resize */
3489     res = PyString_FromStringAndSize(NULL, size);
3490     if (res == NULL)
3491         goto onError;
3492     if (size == 0)
3493         return res;
3494     str = PyString_AS_STRING(res);
3495     ressize = size;
3496
3497     while (p<endp) {
3498         Py_UNICODE c = *p;
3499
3500         /* can we encode this? */
3501         if (c<limit) {
3502             /* no overflow check, because we know that the space is enough */
3503             *str++ = (char)c;
3504             ++p;
3505         }
3506         else {
3507             Py_ssize_t unicodepos = p-startp;
3508             Py_ssize_t requiredsize;
3509             PyObject *repunicode;
3510             Py_ssize_t repsize;
3511             Py_ssize_t newpos;
3512             Py_ssize_t respos;
3513             Py_UNICODE *uni2;
3514             /* startpos for collecting unencodable chars */
3515             const Py_UNICODE *collstart = p;
3516             const Py_UNICODE *collend = p;
3517             /* find all unecodable characters */
3518             while ((collend < endp) && ((*collend)>=limit))
3519                 ++collend;
3520             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3521             if (known_errorHandler==-1) {
3522                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3523                     known_errorHandler = 1;
3524                 else if (!strcmp(errors, "replace"))
3525                     known_errorHandler = 2;
3526                 else if (!strcmp(errors, "ignore"))
3527                     known_errorHandler = 3;
3528                 else if (!strcmp(errors, "xmlcharrefreplace"))
3529                     known_errorHandler = 4;
3530                 else
3531                     known_errorHandler = 0;
3532             }
3533             switch (known_errorHandler) {
3534                 case 1: /* strict */
3535                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3536                     goto onError;
3537                 case 2: /* replace */
3538                     while (collstart++<collend)
3539                         *str++ = '?'; /* fall through */
3540                 case 3: /* ignore */
3541                     p = collend;
3542                     break;
3543                 case 4: /* xmlcharrefreplace */
3544                     respos = str-PyString_AS_STRING(res);
3545                     /* determine replacement size (temporarily (mis)uses p) */
3546                     for (p = collstart, repsize = 0; p < collend; ++p) {
3547                         if (*p<10)
3548                             repsize += 2+1+1;
3549                         else if (*p<100)
3550                             repsize += 2+2+1;
3551                         else if (*p<1000)
3552                             repsize += 2+3+1;
3553                         else if (*p<10000)
3554                             repsize += 2+4+1;
3555 #ifndef Py_UNICODE_WIDE
3556                         else
3557                             repsize += 2+5+1;
3558 #else
3559                         else if (*p<100000)
3560                             repsize += 2+5+1;
3561                         else if (*p<1000000)
3562                             repsize += 2+6+1;
3563                         else
3564                             repsize += 2+7+1;
3565 #endif
3566                     }
3567                     requiredsize = respos+repsize+(endp-collend);
3568                     if (requiredsize > ressize) {
3569                         if (requiredsize<2*ressize)
3570                             requiredsize = 2*ressize;
3571                         if (_PyString_Resize(&res, requiredsize))
3572                             goto onError;
3573                         str = PyString_AS_STRING(res) + respos;
3574                         ressize = requiredsize;
3575                     }
3576                     /* generate replacement (temporarily (mis)uses p) */
3577                     for (p = collstart; p < collend; ++p) {
3578                         str += sprintf(str, "&#%d;", (int)*p);
3579                     }
3580                     p = collend;
3581                     break;
3582                 default:
3583                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3584                         encoding, reason, startp, size, &exc,
3585                         collstart-startp, collend-startp, &newpos);
3586                     if (repunicode == NULL)
3587                         goto onError;
3588                     /* need more space? (at least enough for what we
3589                        have+the replacement+the rest of the string, so
3590                        we won't have to check space for encodable characters) */
3591                     respos = str-PyString_AS_STRING(res);
3592                     repsize = PyUnicode_GET_SIZE(repunicode);
3593                     requiredsize = respos+repsize+(endp-collend);
3594                     if (requiredsize > ressize) {
3595                         if (requiredsize<2*ressize)
3596                             requiredsize = 2*ressize;
3597                         if (_PyString_Resize(&res, requiredsize)) {
3598                             Py_DECREF(repunicode);
3599                             goto onError;
3600                         }
3601                         str = PyString_AS_STRING(res) + respos;
3602                         ressize = requiredsize;
3603                     }
3604                     /* check if there is anything unencodable in the replacement
3605                        and copy it to the output */
3606                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3607                         c = *uni2;
3608                         if (c >= limit) {
3609                             raise_encode_exception(&exc, encoding, startp, size,
3610                                 unicodepos, unicodepos+1, reason);
3611                             Py_DECREF(repunicode);
3612                             goto onError;
3613                         }
3614                         *str = (char)c;
3615                     }
3616                     p = startp + newpos;
3617                     Py_DECREF(repunicode);
3618             }
3619         }
3620     }
3621     /* Resize if we allocated to much */
3622     respos = str-PyString_AS_STRING(res);
3623     if (respos<ressize)
3624        /* If this falls res will be NULL */
3625         _PyString_Resize(&res, respos);
3626     Py_XDECREF(errorHandler);
3627     Py_XDECREF(exc);
3628     return res;
3629
3630     onError:
3631     Py_XDECREF(res);
3632     Py_XDECREF(errorHandler);
3633     Py_XDECREF(exc);
3634     return NULL;
3635 }
3636
3637 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3638                                  Py_ssize_t size,
3639                                  const char *errors)
3640 {
3641     return unicode_encode_ucs1(p, size, errors, 256);
3642 }
3643
3644 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3645 {
3646     if (!PyUnicode_Check(unicode)) {
3647         PyErr_BadArgument();
3648         return NULL;
3649     }
3650     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3651                                   PyUnicode_GET_SIZE(unicode),
3652                                   NULL);
3653 }
3654
3655 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3656
3657 PyObject *PyUnicode_DecodeASCII(const char *s,
3658                                 Py_ssize_t size,
3659                                 const char *errors)
3660 {
3661     const char *starts = s;
3662     PyUnicodeObject *v;
3663     Py_UNICODE *p;
3664     Py_ssize_t startinpos;
3665     Py_ssize_t endinpos;
3666     Py_ssize_t outpos;
3667     const char *e;
3668     PyObject *errorHandler = NULL;
3669     PyObject *exc = NULL;
3670
3671     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3672     if (size == 1 && *(unsigned char*)s < 128) {
3673         Py_UNICODE r = *(unsigned char*)s;
3674         return PyUnicode_FromUnicode(&r, 1);
3675     }
3676
3677     v = _PyUnicode_New(size);
3678     if (v == NULL)
3679         goto onError;
3680     if (size == 0)
3681         return (PyObject *)v;
3682     p = PyUnicode_AS_UNICODE(v);
3683     e = s + size;
3684     while (s < e) {
3685         register unsigned char c = (unsigned char)*s;
3686         if (c < 128) {
3687             *p++ = c;
3688             ++s;
3689         }
3690         else {
3691             startinpos = s-starts;
3692             endinpos = startinpos + 1;
3693             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3694             if (unicode_decode_call_errorhandler(
3695                  errors, &errorHandler,
3696                  "ascii", "ordinal not in range(128)",
3697                  starts, size, &startinpos, &endinpos, &exc, &s,
3698                  (PyObject **)&v, &outpos, &p))
3699                 goto onError;
3700         }
3701     }
3702     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3703         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3704             goto onError;
3705     Py_XDECREF(errorHandler);
3706     Py_XDECREF(exc);
3707     return (PyObject *)v;
3708
3709  onError:
3710     Py_XDECREF(v);
3711     Py_XDECREF(errorHandler);
3712     Py_XDECREF(exc);
3713     return NULL;
3714 }
3715
3716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3717                                 Py_ssize_t size,
3718                                 const char *errors)
3719 {
3720     return unicode_encode_ucs1(p, size, errors, 128);
3721 }
3722
3723 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3724 {
3725     if (!PyUnicode_Check(unicode)) {
3726         PyErr_BadArgument();
3727         return NULL;
3728     }
3729     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3730                                  PyUnicode_GET_SIZE(unicode),
3731                                  NULL);
3732 }
3733
3734 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3735
3736 /* --- MBCS codecs for Windows -------------------------------------------- */
3737
3738 #if SIZEOF_INT < SIZEOF_SSIZE_T
3739 #define NEED_RETRY
3740 #endif
3741
3742 /* XXX This code is limited to "true" double-byte encodings, as
3743    a) it assumes an incomplete character consists of a single byte, and
3744    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3745       encodings, see IsDBCSLeadByteEx documentation. */
3746
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts is only counted
   when the character before it (found with CharPrev) does not make it
   a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3757
3758 /*
3759  * Decode MBCS string into unicode object. If 'final' is set, converts
3760  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3761  */
3762 static int decode_mbcs(PyUnicodeObject **v,
3763                         const char *s, /* MBCS string */
3764                         int size, /* sizeof MBCS string */
3765                         int final)
3766 {
3767     Py_UNICODE *p;
3768     Py_ssize_t n = 0;
3769     int usize = 0;
3770
3771     assert(size >= 0);
3772
3773     /* Skip trailing lead-byte unless 'final' is set */
3774     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3775         --size;
3776
3777     /* First get the size of the result */
3778     if (size > 0) {
3779         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3780         if (usize == 0) {
3781             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3782             return -1;
3783         }
3784     }
3785
3786     if (*v == NULL) {
3787         /* Create unicode object */
3788         *v = _PyUnicode_New(usize);
3789         if (*v == NULL)
3790             return -1;
3791     }
3792     else {
3793         /* Extend unicode object */
3794         n = PyUnicode_GET_SIZE(*v);
3795         if (_PyUnicode_Resize(v, n + usize) < 0)
3796             return -1;
3797     }
3798
3799     /* Do the conversion */
3800     if (size > 0) {
3801         p = PyUnicode_AS_UNICODE(*v) + n;
3802         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3803             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3804             return -1;
3805         }
3806     }
3807
3808     return size;
3809 }
3810
3811 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3812                                         Py_ssize_t size,
3813                                         const char *errors,
3814                                         Py_ssize_t *consumed)
3815 {
3816     PyUnicodeObject *v = NULL;
3817     int done;
3818
3819     if (consumed)
3820         *consumed = 0;
3821
3822 #ifdef NEED_RETRY
3823   retry:
3824     if (size > INT_MAX)
3825         done = decode_mbcs(&v, s, INT_MAX, 0);
3826     else
3827 #endif
3828         done = decode_mbcs(&v, s, (int)size, !consumed);
3829
3830     if (done < 0) {
3831         Py_XDECREF(v);
3832         return NULL;
3833     }
3834
3835     if (consumed)
3836         *consumed += done;
3837
3838 #ifdef NEED_RETRY
3839     if (size > INT_MAX) {
3840         s += done;
3841         size -= done;
3842         goto retry;
3843     }
3844 #endif
3845
3846     return (PyObject *)v;
3847 }
3848
3849 PyObject *PyUnicode_DecodeMBCS(const char *s,
3850                                 Py_ssize_t size,
3851                                 const char *errors)
3852 {
3853     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3854 }
3855
3856 /*
3857  * Convert unicode into string object (MBCS).
3858  * Returns 0 if succeed, -1 otherwise.
3859  */
3860 static int encode_mbcs(PyObject **repr,
3861                         const Py_UNICODE *p, /* unicode */
3862                         int size) /* size of unicode */
3863 {
3864     int mbcssize = 0;
3865     Py_ssize_t n = 0;
3866
3867     assert(size >= 0);
3868
3869     /* First get the size of the result */
3870     if (size > 0) {
3871         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3872         if (mbcssize == 0) {
3873             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3874             return -1;
3875         }
3876     }
3877
3878     if (*repr == NULL) {
3879         /* Create string object */
3880         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3881         if (*repr == NULL)
3882             return -1;
3883     }
3884     else {
3885         /* Extend string object */
3886         n = PyString_Size(*repr);
3887         if (_PyString_Resize(repr, n + mbcssize) < 0)
3888             return -1;
3889     }
3890
3891     /* Do the conversion */
3892     if (size > 0) {
3893         char *s = PyString_AS_STRING(*repr) + n;
3894         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3895             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3896             return -1;
3897         }
3898     }
3899
3900     return 0;
3901 }
3902
3903 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3904                                 Py_ssize_t size,
3905                                 const char *errors)
3906 {
3907     PyObject *repr = NULL;
3908     int ret;
3909
3910 #ifdef NEED_RETRY
3911  retry:
3912     if (size > INT_MAX)
3913         ret = encode_mbcs(&repr, p, INT_MAX);
3914     else
3915 #endif
3916         ret = encode_mbcs(&repr, p, (int)size);
3917
3918     if (ret < 0) {
3919         Py_XDECREF(repr);
3920         return NULL;
3921     }
3922
3923 #ifdef NEED_RETRY
3924     if (size > INT_MAX) {
3925         p += INT_MAX;
3926         size -= INT_MAX;
3927         goto retry;
3928     }
3929 #endif
3930
3931     return repr;
3932 }
3933
3934 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3935 {
3936     if (!PyUnicode_Check(unicode)) {
3937         PyErr_BadArgument();
3938         return NULL;
3939     }
3940     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3941                                 PyUnicode_GET_SIZE(unicode),
3942                                 NULL);
3943 }
3944
3945 #undef NEED_RETRY
3946
3947 #endif /* MS_WINDOWS */
3948
3949 /* --- Character Mapping Codec -------------------------------------------- */
3950
3951 PyObject *PyUnicode_DecodeCharmap(const char *s,
3952                                   Py_ssize_t size,
3953                                   PyObject *mapping,
3954                                   const char *errors)
3955 {
3956     const char *starts = s;
3957     Py_ssize_t startinpos;
3958     Py_ssize_t endinpos;
3959     Py_ssize_t outpos;
3960     const char *e;
3961     PyUnicodeObject *v;
3962     Py_UNICODE *p;
3963     Py_ssize_t extrachars = 0;
3964     PyObject *errorHandler = NULL;
3965     PyObject *exc = NULL;
3966     Py_UNICODE *mapstring = NULL;
3967     Py_ssize_t maplen = 0;
3968
3969     /* Default to Latin-1 */
3970     if (mapping == NULL)
3971         return PyUnicode_DecodeLatin1(s, size, errors);
3972
3973     v = _PyUnicode_New(size);
3974     if (v == NULL)
3975         goto onError;
3976     if (size == 0)
3977         return (PyObject *)v;
3978     p = PyUnicode_AS_UNICODE(v);
3979     e = s + size;
3980     if (PyUnicode_CheckExact(mapping)) {
3981         mapstring = PyUnicode_AS_UNICODE(mapping);
3982         maplen = PyUnicode_GET_SIZE(mapping);
3983         while (s < e) {
3984             unsigned char ch = *s;
3985             Py_UNICODE x = 0xfffe; /* illegal value */
3986
3987             if (ch < maplen)
3988                 x = mapstring[ch];
3989
3990             if (x == 0xfffe) {
3991                 /* undefined mapping */
3992                 outpos = p-PyUnicode_AS_UNICODE(v);
3993                 startinpos = s-starts;
3994                 endinpos = startinpos+1;
3995                 if (unicode_decode_call_errorhandler(
3996                      errors, &errorHandler,
3997                      "charmap", "character maps to <undefined>",
3998                      starts, size, &startinpos, &endinpos, &exc, &s,
3999                      (PyObject **)&v, &outpos, &p)) {
4000                     goto onError;
4001                 }
4002                 continue;
4003             }
4004             *p++ = x;
4005             ++s;
4006         }
4007     }
4008     else {
4009         while (s < e) {
4010             unsigned char ch = *s;
4011             PyObject *w, *x;
4012
4013             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4014             w = PyInt_FromLong((long)ch);
4015             if (w == NULL)
4016                 goto onError;
4017             x = PyObject_GetItem(mapping, w);
4018             Py_DECREF(w);
4019             if (x == NULL) {
4020                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4021                     /* No mapping found means: mapping is undefined. */
4022                     PyErr_Clear();
4023                     x = Py_None;
4024                     Py_INCREF(x);
4025                 } else
4026                     goto onError;
4027             }
4028
4029             /* Apply mapping */
4030             if (PyInt_Check(x)) {
4031                 long value = PyInt_AS_LONG(x);
4032                 if (value < 0 || value > 65535) {
4033                     PyErr_SetString(PyExc_TypeError,
4034                                     "character mapping must be in range(65536)");
4035                     Py_DECREF(x);
4036                     goto onError;
4037                 }
4038                 *p++ = (Py_UNICODE)value;
4039             }
4040             else if (x == Py_None) {
4041                 /* undefined mapping */
4042                 outpos = p-PyUnicode_AS_UNICODE(v);
4043                 startinpos = s-starts;
4044                 endinpos = startinpos+1;
4045                 if (unicode_decode_call_errorhandler(
4046                      errors, &errorHandler,
4047                      "charmap", "character maps to <undefined>",
4048                      starts, size, &startinpos, &endinpos, &exc, &s,
4049                      (PyObject **)&v, &outpos, &p)) {
4050                     Py_DECREF(x);
4051                     goto onError;
4052                 }
4053                 Py_DECREF(x);
4054                 continue;
4055             }
4056             else if (PyUnicode_Check(x)) {
4057                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4058
4059                 if (targetsize == 1)
4060                     /* 1-1 mapping */
4061                     *p++ = *PyUnicode_AS_UNICODE(x);
4062
4063                 else if (targetsize > 1) {
4064                     /* 1-n mapping */
4065                     if (targetsize > extrachars) {
4066                         /* resize first */
4067                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4068                         Py_ssize_t needed = (targetsize - extrachars) + \
4069                                      (targetsize << 2);
4070                         extrachars += needed;
4071                         /* XXX overflow detection missing */
4072                         if (_PyUnicode_Resize(&v,
4073                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4074                             Py_DECREF(x);
4075                             goto onError;
4076                         }
4077                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4078                     }
4079                     Py_UNICODE_COPY(p,
4080                                     PyUnicode_AS_UNICODE(x),
4081                                     targetsize);
4082                     p += targetsize;
4083                     extrachars -= targetsize;
4084                 }
4085                 /* 1-0 mapping: skip the character */
4086             }
4087             else {
4088                 /* wrong return value */
4089                 PyErr_SetString(PyExc_TypeError,
4090                       "character mapping must return integer, None or unicode");
4091                 Py_DECREF(x);
4092                 goto onError;
4093             }
4094             Py_DECREF(x);
4095             ++s;
4096         }
4097     }
4098     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4099         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4100             goto onError;
4101     Py_XDECREF(errorHandler);
4102     Py_XDECREF(exc);
4103     return (PyObject *)v;
4104
4105  onError:
4106     Py_XDECREF(errorHandler);
4107     Py_XDECREF(exc);
4108     Py_XDECREF(v);
4109     return NULL;
4110 }
4111
4112 /* Charmap encoding: the lookup table */
4113
4114 struct encoding_map{
4115   PyObject_HEAD
4116   unsigned char level1[32];
4117   int count2, count3;
4118   unsigned char level23[1];
4119 };
4120
4121 static PyObject*
4122 encoding_map_size(PyObject *obj, PyObject* args)
4123 {
4124     struct encoding_map *map = (struct encoding_map*)obj;
4125     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4126                           128*map->count3);
4127 }
4128
4129 static PyMethodDef encoding_map_methods[] = {
4130         {"size", encoding_map_size, METH_NOARGS,
4131          PyDoc_STR("Return the size (in bytes) of this object") },
4132         { 0 }
4133 };
4134
4135 static void
4136 encoding_map_dealloc(PyObject* o)
4137 {
4138         PyObject_FREE(o);
4139 }
4140
4141 static PyTypeObject EncodingMapType = {
4142         PyVarObject_HEAD_INIT(NULL, 0)
4143         "EncodingMap",          /*tp_name*/
4144         sizeof(struct encoding_map),   /*tp_basicsize*/
4145         0,                      /*tp_itemsize*/
4146         /* methods */
4147         encoding_map_dealloc,   /*tp_dealloc*/
4148         0,                      /*tp_print*/
4149         0,                      /*tp_getattr*/
4150         0,                      /*tp_setattr*/
4151         0,                      /*tp_compare*/
4152         0,                      /*tp_repr*/
4153         0,                      /*tp_as_number*/
4154         0,                      /*tp_as_sequence*/
4155         0,                      /*tp_as_mapping*/
4156         0,                      /*tp_hash*/
4157         0,                      /*tp_call*/
4158         0,                      /*tp_str*/
4159         0,                      /*tp_getattro*/
4160         0,                      /*tp_setattro*/
4161         0,                      /*tp_as_buffer*/
4162         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4163         0,                      /*tp_doc*/
4164         0,                      /*tp_traverse*/
4165         0,                      /*tp_clear*/
4166         0,                      /*tp_richcompare*/
4167         0,                      /*tp_weaklistoffset*/
4168         0,                      /*tp_iter*/
4169         0,                      /*tp_iternext*/
4170         encoding_map_methods,   /*tp_methods*/
4171         0,                      /*tp_members*/
4172         0,                      /*tp_getset*/
4173         0,                      /*tp_base*/
4174         0,                      /*tp_dict*/
4175         0,                      /*tp_descr_get*/
4176         0,                      /*tp_descr_set*/
4177         0,                      /*tp_dictoffset*/
4178         0,                      /*tp_init*/
4179         0,                      /*tp_alloc*/
4180         0,                      /*tp_new*/
4181         0,                      /*tp_free*/
4182         0,                      /*tp_is_gc*/
4183 };
4184
4185 PyObject*
4186 PyUnicode_BuildEncodingMap(PyObject* string)
4187 {
4188     Py_UNICODE *decode;
4189     PyObject *result;
4190     struct encoding_map *mresult;
4191     int i;
4192     int need_dict = 0;
4193     unsigned char level1[32];
4194     unsigned char level2[512];
4195     unsigned char *mlevel1, *mlevel2, *mlevel3;
4196     int count2 = 0, count3 = 0;
4197
4198     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4199         PyErr_BadArgument();
4200         return NULL;
4201     }
4202     decode = PyUnicode_AS_UNICODE(string);
4203     memset(level1, 0xFF, sizeof level1);
4204     memset(level2, 0xFF, sizeof level2);
4205
4206     /* If there isn't a one-to-one mapping of NULL to \0,
4207        or if there are non-BMP characters, we need to use
4208        a mapping dictionary. */
4209     if (decode[0] != 0)
4210         need_dict = 1;
4211     for (i = 1; i < 256; i++) {
4212         int l1, l2;
4213         if (decode[i] == 0
4214             #ifdef Py_UNICODE_WIDE
4215             || decode[i] > 0xFFFF
4216             #endif
4217         ) {
4218             need_dict = 1;
4219             break;
4220         }
4221         if (decode[i] == 0xFFFE)
4222             /* unmapped character */
4223             continue;
4224         l1 = decode[i] >> 11;
4225         l2 = decode[i] >> 7;
4226         if (level1[l1] == 0xFF)
4227             level1[l1] = count2++;
4228         if (level2[l2] == 0xFF)
4229             level2[l2] = count3++;
4230     }
4231
4232     if (count2 >= 0xFF || count3 >= 0xFF)
4233         need_dict = 1;
4234
4235     if (need_dict) {
4236         PyObject *result = PyDict_New();
4237         PyObject *key, *value;
4238         if (!result)
4239             return NULL;
4240         for (i = 0; i < 256; i++) {
4241             key = value = NULL;
4242             key = PyInt_FromLong(decode[i]);
4243             value = PyInt_FromLong(i);
4244             if (!key || !value)
4245                 goto failed1;
4246             if (PyDict_SetItem(result, key, value) == -1)
4247                 goto failed1;
4248             Py_DECREF(key);
4249             Py_DECREF(value);
4250         }
4251         return result;
4252       failed1:
4253         Py_XDECREF(key);
4254         Py_XDECREF(value);
4255         Py_DECREF(result);
4256         return NULL;
4257     }
4258
4259     /* Create a three-level trie */
4260     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4261                              16*count2 + 128*count3 - 1);
4262     if (!result)
4263         return PyErr_NoMemory();
4264     PyObject_Init(result, &EncodingMapType);
4265     mresult = (struct encoding_map*)result;
4266     mresult->count2 = count2;
4267     mresult->count3 = count3;
4268     mlevel1 = mresult->level1;
4269     mlevel2 = mresult->level23;
4270     mlevel3 = mresult->level23 + 16*count2;
4271     memcpy(mlevel1, level1, 32);
4272     memset(mlevel2, 0xFF, 16*count2);
4273     memset(mlevel3, 0, 128*count3);
4274     count3 = 0;
4275     for (i = 1; i < 256; i++) {
4276         int o1, o2, o3, i2, i3;
4277         if (decode[i] == 0xFFFE)
4278             /* unmapped character */
4279             continue;
4280         o1 = decode[i]>>11;
4281         o2 = (decode[i]>>7) & 0xF;
4282         i2 = 16*mlevel1[o1] + o2;
4283         if (mlevel2[i2] == 0xFF)
4284             mlevel2[i2] = count3++;
4285         o3 = decode[i] & 0x7F;
4286         i3 = 128*mlevel2[i2] + o3;
4287         mlevel3[i3] = i;
4288     }
4289     return result;
4290 }
4291
4292 static int
4293 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4294 {
4295     struct encoding_map *map = (struct encoding_map*)mapping;
4296     int l1 = c>>11;
4297     int l2 = (c>>7) & 0xF;
4298     int l3 = c & 0x7F;
4299     int i;
4300
4301 #ifdef Py_UNICODE_WIDE
4302     if (c > 0xFFFF) {
4303         return -1;
4304     }
4305 #endif
4306     if (c == 0)
4307         return 0;
4308     /* level 1*/
4309     i = map->level1[l1];
4310     if (i == 0xFF) {
4311         return -1;
4312     }
4313     /* level 2*/
4314     i = map->level23[16*i+l2];
4315     if (i == 0xFF) {
4316         return -1;
4317     }
4318     /* level 3 */
4319     i = map->level23[16*map->count2 + 128*i + l3];
4320     if (i == 0) {
4321         return -1;
4322     }
4323     return i;
4324 }
4325
4326 /* Lookup the character ch in the mapping. If the character
4327    can't be found, Py_None is returned (or NULL, if another
4328    error occurred). */
4329 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4330 {
4331     PyObject *w = PyInt_FromLong((long)c);
4332     PyObject *x;
4333
4334     if (w == NULL)
4335          return NULL;
4336     x = PyObject_GetItem(mapping, w);
4337     Py_DECREF(w);
4338     if (x == NULL) {
4339         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4340             /* No mapping found means: mapping is undefined. */
4341             PyErr_Clear();
4342             x = Py_None;
4343             Py_INCREF(x);
4344             return x;
4345         } else
4346             return NULL;
4347     }
4348     else if (x == Py_None)
4349         return x;
4350     else if (PyInt_Check(x)) {
4351         long value = PyInt_AS_LONG(x);
4352         if (value < 0 || value > 255) {
4353             PyErr_SetString(PyExc_TypeError,
4354                              "character mapping must be in range(256)");
4355             Py_DECREF(x);
4356             return NULL;
4357         }
4358         return x;
4359     }
4360     else if (PyString_Check(x))
4361         return x;
4362     else {
4363         /* wrong return value */
4364         PyErr_SetString(PyExc_TypeError,
4365               "character mapping must return integer, None or str");
4366         Py_DECREF(x);
4367         return NULL;
4368     }
4369 }
4370
4371 static int
4372 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4373 {
4374         Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4375         /* exponentially overallocate to minimize reallocations */
4376         if (requiredsize < 2*outsize)
4377             requiredsize = 2*outsize;
4378         if (_PyString_Resize(outobj, requiredsize)) {
4379             return 0;
4380         }
4381         return 1;
4382 }
4383
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
4387 /* lookup the character, put the result in the output string and adjust
4388    various state variables. Reallocate the output string if not enough
4389    space is available. Return a new reference to the object that
4390    was put in the output buffer, or Py_None, if the mapping was undefined
4391    (in which case no character was written) or NULL, if a
4392    reallocation error occurred. The caller must decref the result */
4393 static
4394 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4395     PyObject **outobj, Py_ssize_t *outpos)
4396 {
4397     PyObject *rep;
4398     char *outstart;
4399     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4400
4401     if (Py_TYPE(mapping) == &EncodingMapType) {
4402         int res = encoding_map_lookup(c, mapping);
4403         Py_ssize_t requiredsize = *outpos+1;
4404         if (res == -1)
4405             return enc_FAILED;
4406         if (outsize<requiredsize)
4407             if (!charmapencode_resize(outobj, outpos, requiredsize))
4408                 return enc_EXCEPTION;
4409         outstart = PyString_AS_STRING(*outobj);
4410         outstart[(*outpos)++] = (char)res;
4411         return enc_SUCCESS;
4412     }
4413
4414     rep = charmapencode_lookup(c, mapping);
4415     if (rep==NULL)
4416         return enc_EXCEPTION;
4417     else if (rep==Py_None) {
4418         Py_DECREF(rep);
4419         return enc_FAILED;
4420     } else {
4421         if (PyInt_Check(rep)) {
4422             Py_ssize_t requiredsize = *outpos+1;
4423             if (outsize<requiredsize)
4424                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4425                     Py_DECREF(rep);
4426                     return enc_EXCEPTION;
4427                 }
4428             outstart = PyString_AS_STRING(*outobj);
4429             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4430         }
4431         else {
4432             const char *repchars = PyString_AS_STRING(rep);
4433             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4434             Py_ssize_t requiredsize = *outpos+repsize;
4435             if (outsize<requiredsize)
4436                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4437                     Py_DECREF(rep);
4438                     return enc_EXCEPTION;
4439                 }
4440             outstart = PyString_AS_STRING(*outobj);
4441             memcpy(outstart + *outpos, repchars, repsize);
4442             *outpos += repsize;
4443         }
4444     }
4445     Py_DECREF(rep);
4446     return enc_SUCCESS;
4447 }
4448
4449 /* handle an error in PyUnicode_EncodeCharmap
4450    Return 0 on success, -1 on error */
4451 static
4452 int charmap_encoding_error(
4453     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4454     PyObject **exceptionObject,
4455     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4456     PyObject **res, Py_ssize_t *respos)
4457 {
4458     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4459     Py_ssize_t repsize;
4460     Py_ssize_t newpos;
4461     Py_UNICODE *uni2;
4462     /* startpos for collecting unencodable chars */
4463     Py_ssize_t collstartpos = *inpos;
4464     Py_ssize_t collendpos = *inpos+1;
4465     Py_ssize_t collpos;
4466     char *encoding = "charmap";
4467     char *reason = "character maps to <undefined>";
4468     charmapencode_result x;
4469
4470     /* find all unencodable characters */
4471     while (collendpos < size) {
4472         PyObject *rep;
4473         if (Py_TYPE(mapping) == &EncodingMapType) {
4474             int res = encoding_map_lookup(p[collendpos], mapping);
4475             if (res != -1)
4476                 break;
4477             ++collendpos;
4478             continue;
4479         }
4480
4481         rep = charmapencode_lookup(p[collendpos], mapping);
4482         if (rep==NULL)
4483             return -1;
4484         else if (rep!=Py_None) {
4485             Py_DECREF(rep);
4486             break;
4487         }
4488         Py_DECREF(rep);
4489         ++collendpos;
4490     }
4491     /* cache callback name lookup
4492      * (if not done yet, i.e. it's the first error) */
4493     if (*known_errorHandler==-1) {
4494         if ((errors==NULL) || (!strcmp(errors, "strict")))
4495             *known_errorHandler = 1;
4496         else if (!strcmp(errors, "replace"))
4497             *known_errorHandler = 2;
4498         else if (!strcmp(errors, "ignore"))
4499             *known_errorHandler = 3;
4500         else if (!strcmp(errors, "xmlcharrefreplace"))
4501             *known_errorHandler = 4;
4502         else
4503             *known_errorHandler = 0;
4504     }
4505     switch (*known_errorHandler) {
4506         case 1: /* strict */
4507             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4508             return -1;
4509         case 2: /* replace */
4510             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4511                 x = charmapencode_output('?', mapping, res, respos);
4512                 if (x==enc_EXCEPTION) {
4513                     return -1;
4514                 }
4515                 else if (x==enc_FAILED) {
4516                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4517                     return -1;
4518                 }
4519             }
4520             /* fall through */
4521         case 3: /* ignore */
4522             *inpos = collendpos;
4523             break;
4524         case 4: /* xmlcharrefreplace */
4525             /* generate replacement (temporarily (mis)uses p) */
4526             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4527                 char buffer[2+29+1+1];
4528                 char *cp;
4529                 sprintf(buffer, "&#%d;", (int)p[collpos]);
4530                 for (cp = buffer; *cp; ++cp) {
4531                     x = charmapencode_output(*cp, mapping, res, respos);
4532                     if (x==enc_EXCEPTION)
4533                         return -1;
4534                     else if (x==enc_FAILED) {
4535                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4536                         return -1;
4537                     }
4538                 }
4539             }
4540             *inpos = collendpos;
4541             break;
4542         default:
4543             repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4544                 encoding, reason, p, size, exceptionObject,
4545                 collstartpos, collendpos, &newpos);
4546             if (repunicode == NULL)
4547                 return -1;
4548             /* generate replacement  */
4549             repsize = PyUnicode_GET_SIZE(repunicode);
4550             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4551                 x = charmapencode_output(*uni2, mapping, res, respos);
4552                 if (x==enc_EXCEPTION) {
4553                     return -1;
4554                 }
4555                 else if (x==enc_FAILED) {
4556                     Py_DECREF(repunicode);
4557                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4558                     return -1;
4559                 }
4560             }
4561             *inpos = newpos;
4562             Py_DECREF(repunicode);
4563     }
4564     return 0;
4565 }
4566
4567 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4568                                   Py_ssize_t size,
4569                                   PyObject *mapping,
4570                                   const char *errors)
4571 {
4572     /* output object */
4573     PyObject *res = NULL;
4574     /* current input position */
4575     Py_ssize_t inpos = 0;
4576     /* current output position */
4577     Py_ssize_t respos = 0;
4578     PyObject *errorHandler = NULL;
4579     PyObject *exc = NULL;
4580     /* the following variable is used for caching string comparisons
4581      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4582      * 3=ignore, 4=xmlcharrefreplace */
4583     int known_errorHandler = -1;
4584
4585     /* Default to Latin-1 */
4586     if (mapping == NULL)
4587         return PyUnicode_EncodeLatin1(p, size, errors);
4588
4589     /* allocate enough for a simple encoding without
4590        replacements, if we need more, we'll resize */
4591     res = PyString_FromStringAndSize(NULL, size);
4592     if (res == NULL)
4593         goto onError;
4594     if (size == 0)
4595         return res;
4596
4597     while (inpos<size) {
4598         /* try to encode it */
4599         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4600         if (x==enc_EXCEPTION) /* error */
4601             goto onError;
4602         if (x==enc_FAILED) { /* unencodable character */
4603             if (charmap_encoding_error(p, size, &inpos, mapping,
4604                 &exc,
4605                 &known_errorHandler, &errorHandler, errors,
4606                 &res, &respos)) {
4607                 goto onError;
4608             }
4609         }
4610         else
4611             /* done with this character => adjust input position */
4612             ++inpos;
4613     }
4614
4615     /* Resize if we allocated to much */
4616     if (respos<PyString_GET_SIZE(res)) {
4617         if (_PyString_Resize(&res, respos))
4618             goto onError;
4619     }
4620     Py_XDECREF(exc);
4621     Py_XDECREF(errorHandler);
4622     return res;
4623
4624     onError:
4625     Py_XDECREF(res);
4626     Py_XDECREF(exc);
4627     Py_XDECREF(errorHandler);
4628     return NULL;
4629 }
4630
4631 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4632                                     PyObject *mapping)
4633 {
4634     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4635         PyErr_BadArgument();
4636         return NULL;
4637     }
4638     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4639                                    PyUnicode_GET_SIZE(unicode),
4640                                    mapping,
4641                                    NULL);
4642 }
4643
4644 /* create or adjust a UnicodeTranslateError */
4645 static void make_translate_exception(PyObject **exceptionObject,
4646     const Py_UNICODE *unicode, Py_ssize_t size,
4647     Py_ssize_t startpos, Py_ssize_t endpos,
4648     const char *reason)
4649 {
4650     if (*exceptionObject == NULL) {
4651         *exceptionObject = PyUnicodeTranslateError_Create(
4652             unicode, size, startpos, endpos, reason);
4653     }
4654     else {
4655         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4656             goto onError;
4657         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4658             goto onError;
4659         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4660             goto onError;
4661         return;
4662         onError:
4663         Py_DECREF(*exceptionObject);
4664         *exceptionObject = NULL;
4665     }
4666 }
4667
4668 /* raises a UnicodeTranslateError */
4669 static void raise_translate_exception(PyObject **exceptionObject,
4670     const Py_UNICODE *unicode, Py_ssize_t size,
4671     Py_ssize_t startpos, Py_ssize_t endpos,
4672     const char *reason)
4673 {
4674     make_translate_exception(exceptionObject,
4675         unicode, size, startpos, endpos, reason);
4676     if (*exceptionObject != NULL)
4677         PyCodec_StrictErrors(*exceptionObject);
4678 }
4679
4680 /* error handling callback helper:
4681    build arguments, call the callback and check the arguments,
4682    put the result into newpos and return the replacement string, which
4683    has to be freed by the caller */
4684 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4685     PyObject **errorHandler,
4686     const char *reason,
4687     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4688     Py_ssize_t startpos, Py_ssize_t endpos,
4689     Py_ssize_t *newpos)
4690 {
4691     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4692
4693     Py_ssize_t i_newpos;
4694     PyObject *restuple;
4695     PyObject *resunicode;
4696
4697     if (*errorHandler == NULL) {
4698         *errorHandler = PyCodec_LookupError(errors);
4699         if (*errorHandler == NULL)
4700             return NULL;
4701     }
4702
4703     make_translate_exception(exceptionObject,
4704         unicode, size, startpos, endpos, reason);
4705     if (*exceptionObject == NULL)
4706         return NULL;
4707
4708     restuple = PyObject_CallFunctionObjArgs(
4709         *errorHandler, *exceptionObject, NULL);
4710     if (restuple == NULL)
4711         return NULL;
4712     if (!PyTuple_Check(restuple)) {
4713         PyErr_Format(PyExc_TypeError, &argparse[4]);
4714         Py_DECREF(restuple);
4715         return NULL;
4716     }
4717     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4718         &resunicode, &i_newpos)) {
4719         Py_DECREF(restuple);
4720         return NULL;
4721     }
4722     if (i_newpos<0)
4723         *newpos = size+i_newpos;
4724     else
4725         *newpos = i_newpos;
4726     if (*newpos<0 || *newpos>size) {
4727         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4728         Py_DECREF(restuple);
4729         return NULL;
4730     }
4731     Py_INCREF(resunicode);
4732     Py_DECREF(restuple);
4733     return resunicode;
4734 }
4735
4736 /* Lookup the character ch in the mapping and put the result in result,
4737    which must be decrefed by the caller.
4738    Return 0 on success, -1 on error */
4739 static
4740 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4741 {
4742     PyObject *w = PyInt_FromLong((long)c);
4743     PyObject *x;
4744
4745     if (w == NULL)
4746          return -1;
4747     x = PyObject_GetItem(mapping, w);
4748     Py_DECREF(w);
4749     if (x == NULL) {
4750         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4751             /* No mapping found means: use 1:1 mapping. */
4752             PyErr_Clear();
4753             *result = NULL;
4754             return 0;
4755         } else
4756             return -1;
4757     }
4758     else if (x == Py_None) {
4759         *result = x;
4760         return 0;
4761     }
4762     else if (PyInt_Check(x)) {
4763         long value = PyInt_AS_LONG(x);
4764         long max = PyUnicode_GetMax();
4765         if (value < 0 || value > max) {
4766             PyErr_Format(PyExc_TypeError,
4767                              "character mapping must be in range(0x%lx)", max+1);
4768             Py_DECREF(x);
4769             return -1;
4770         }
4771         *result = x;
4772         return 0;
4773     }
4774     else if (PyUnicode_Check(x)) {
4775         *result = x;
4776         return 0;
4777     }
4778     else {
4779         /* wrong return value */
4780         PyErr_SetString(PyExc_TypeError,
4781               "character mapping must return integer, None or unicode");
4782         Py_DECREF(x);
4783         return -1;
4784     }
4785 }
4786 /* ensure that *outobj is at least requiredsize characters long,
4787 if not reallocate and adjust various state variables.
4788 Return 0 on success, -1 on error */
4789 static
4790 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4791     Py_ssize_t requiredsize)
4792 {
4793     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4794     if (requiredsize > oldsize) {
4795         /* remember old output position */
4796         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4797         /* exponentially overallocate to minimize reallocations */
4798         if (requiredsize < 2 * oldsize)
4799             requiredsize = 2 * oldsize;
4800         if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4801             return -1;
4802         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4803     }
4804     return 0;
4805 }
4806 /* lookup the character, put the result in the output string and adjust
4807    various state variables. Return a new reference to the object that
4808    was put in the output buffer in *result, or Py_None, if the mapping was
4809    undefined (in which case no character was written).
4810    The called must decref result.
4811    Return 0 on success, -1 on error. */
4812 static
4813 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4814     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4815     PyObject **res)
4816 {
4817     if (charmaptranslate_lookup(*curinp, mapping, res))
4818         return -1;
4819     if (*res==NULL) {
4820         /* not found => default to 1:1 mapping */
4821         *(*outp)++ = *curinp;
4822     }
4823     else if (*res==Py_None)
4824         ;
4825     else if (PyInt_Check(*res)) {
4826         /* no overflow check, because we know that the space is enough */
4827         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4828     }
4829     else if (PyUnicode_Check(*res)) {
4830         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4831         if (repsize==1) {
4832             /* no overflow check, because we know that the space is enough */
4833             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4834         }
4835         else if (repsize!=0) {
4836             /* more than one character */
4837             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4838                 (insize - (curinp-startinp)) +
4839                 repsize - 1;
4840             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4841                 return -1;
4842             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4843             *outp += repsize;
4844         }
4845     }
4846     else
4847         return -1;
4848     return 0;
4849 }
4850
4851 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4852                                      Py_ssize_t size,
4853                                      PyObject *mapping,
4854                                      const char *errors)
4855 {
4856     /* output object */
4857     PyObject *res = NULL;
4858     /* pointers to the beginning and end+1 of input */
4859     const Py_UNICODE *startp = p;
4860     const Py_UNICODE *endp = p + size;
4861     /* pointer into the output */
4862     Py_UNICODE *str;
4863     /* current output position */
4864     Py_ssize_t respos = 0;
4865     char *reason = "character maps to <undefined>";
4866     PyObject *errorHandler = NULL;
4867     PyObject *exc = NULL;
4868     /* the following variable is used for caching string comparisons
4869      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4870      * 3=ignore, 4=xmlcharrefreplace */
4871     int known_errorHandler = -1;
4872
4873     if (mapping == NULL) {
4874         PyErr_BadArgument();
4875         return NULL;
4876     }
4877
4878     /* allocate enough for a simple 1:1 translation without
4879        replacements, if we need more, we'll resize */
4880     res = PyUnicode_FromUnicode(NULL, size);
4881     if (res == NULL)
4882         goto onError;
4883     if (size == 0)
4884         return res;
4885     str = PyUnicode_AS_UNICODE(res);
4886
4887     while (p<endp) {
4888         /* try to encode it */
4889         PyObject *x = NULL;
4890         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4891             Py_XDECREF(x);
4892             goto onError;
4893         }
4894         Py_XDECREF(x);
4895         if (x!=Py_None) /* it worked => adjust input pointer */
4896             ++p;
4897         else { /* untranslatable character */
4898             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4899             Py_ssize_t repsize;
4900             Py_ssize_t newpos;
4901             Py_UNICODE *uni2;
4902             /* startpos for collecting untranslatable chars */
4903             const Py_UNICODE *collstart = p;
4904             const Py_UNICODE *collend = p+1;
4905             const Py_UNICODE *coll;
4906
4907             /* find all untranslatable characters */
4908             while (collend < endp) {
4909                 if (charmaptranslate_lookup(*collend, mapping, &x))
4910                     goto onError;
4911                 Py_XDECREF(x);
4912                 if (x!=Py_None)
4913                     break;
4914                 ++collend;
4915             }
4916             /* cache callback name lookup
4917              * (if not done yet, i.e. it's the first error) */
4918             if (known_errorHandler==-1) {
4919                 if ((errors==NULL) || (!strcmp(errors, "strict")))
4920                     known_errorHandler = 1;
4921                 else if (!strcmp(errors, "replace"))
4922                     known_errorHandler = 2;
4923                 else if (!strcmp(errors, "ignore"))
4924                     known_errorHandler = 3;
4925                 else if (!strcmp(errors, "xmlcharrefreplace"))
4926                     known_errorHandler = 4;
4927                 else
4928                     known_errorHandler = 0;
4929             }
4930             switch (known_errorHandler) {
4931                 case 1: /* strict */
4932                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4933                     goto onError;
4934                 case 2: /* replace */
4935                     /* No need to check for space, this is a 1:1 replacement */
4936                     for (coll = collstart; coll<collend; ++coll)
4937                         *str++ = '?';
4938                     /* fall through */
4939                 case 3: /* ignore */
4940                     p = collend;
4941                     break;
4942                 case 4: /* xmlcharrefreplace */
4943                     /* generate replacement (temporarily (mis)uses p) */
4944                     for (p = collstart; p < collend; ++p) {
4945                         char buffer[2+29+1+1];
4946                         char *cp;
4947                         sprintf(buffer, "&#%d;", (int)*p);
4948                         if (charmaptranslate_makespace(&res, &str,
4949                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4950                             goto onError;
4951                         for (cp = buffer; *cp; ++cp)
4952                             *str++ = *cp;
4953                     }
4954                     p = collend;
4955                     break;
4956                 default:
4957                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4958                         reason, startp, size, &exc,
4959                         collstart-startp, collend-startp, &newpos);
4960                     if (repunicode == NULL)
4961                         goto onError;
4962                     /* generate replacement  */
4963                     repsize = PyUnicode_GET_SIZE(repunicode);
4964                     if (charmaptranslate_makespace(&res, &str,
4965                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4966                         Py_DECREF(repunicode);
4967                         goto onError;
4968                     }
4969                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4970                         *str++ = *uni2;
4971                     p = startp + newpos;
4972                     Py_DECREF(repunicode);
4973             }
4974         }
4975     }
4976     /* Resize if we allocated to much */
4977     respos = str-PyUnicode_AS_UNICODE(res);
4978     if (respos<PyUnicode_GET_SIZE(res)) {
4979         if (_PyUnicode_Resize(&res, respos) < 0)
4980             goto onError;
4981     }
4982     Py_XDECREF(exc);
4983     Py_XDECREF(errorHandler);
4984     return res;
4985
4986     onError:
4987     Py_XDECREF(res);
4988     Py_XDECREF(exc);
4989     Py_XDECREF(errorHandler);
4990     return NULL;
4991 }
4992
4993 PyObject *PyUnicode_Translate(PyObject *str,
4994                               PyObject *mapping,
4995                               const char *errors)
4996 {
4997     PyObject *result;
4998
4999     str = PyUnicode_FromObject(str);
5000     if (str == NULL)
5001         goto onError;
5002     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5003                                         PyUnicode_GET_SIZE(str),
5004                                         mapping,
5005                                         errors);
5006     Py_DECREF(str);
5007     return result;
5008
5009  onError:
5010     Py_XDECREF(str);
5011     return NULL;
5012 }
5013
5014 /* --- Decimal Encoder ---------------------------------------------------- */
5015
5016 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5017                             Py_ssize_t length,
5018                             char *output,
5019                             const char *errors)
5020 {
5021     Py_UNICODE *p, *end;
5022     PyObject *errorHandler = NULL;
5023     PyObject *exc = NULL;
5024     const char *encoding = "decimal";
5025     const char *reason = "invalid decimal Unicode string";
5026     /* the following variable is used for caching string comparisons
5027      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5028     int known_errorHandler = -1;
5029
5030     if (output == NULL) {
5031         PyErr_BadArgument();
5032         return -1;
5033     }
5034
5035     p = s;
5036     end = s + length;
5037     while (p < end) {
5038         register Py_UNICODE ch = *p;
5039         int decimal;
5040         PyObject *repunicode;
5041         Py_ssize_t repsize;
5042         Py_ssize_t newpos;
5043         Py_UNICODE *uni2;
5044         Py_UNICODE *collstart;
5045         Py_UNICODE *collend;
5046
5047         if (Py_UNICODE_ISSPACE(ch)) {
5048             *output++ = ' ';
5049             ++p;
5050             continue;
5051         }
5052         decimal = Py_UNICODE_TODECIMAL(ch);
5053         if (decimal >= 0) {
5054             *output++ = '0' + decimal;
5055             ++p;
5056             continue;
5057         }
5058         if (0 < ch && ch < 256) {
5059             *output++ = (char)ch;
5060             ++p;
5061             continue;
5062         }
5063         /* All other characters are considered unencodable */
5064         collstart = p;
5065         collend = p+1;
5066         while (collend < end) {
5067             if ((0 < *collend && *collend < 256) ||
5068                 !Py_UNICODE_ISSPACE(*collend) ||
5069                 Py_UNICODE_TODECIMAL(*collend))
5070                 break;
5071         }
5072         /* cache callback name lookup
5073          * (if not done yet, i.e. it's the first error) */
5074         if (known_errorHandler==-1) {
5075             if ((errors==NULL) || (!strcmp(errors, "strict")))
5076                 known_errorHandler = 1;
5077             else if (!strcmp(errors, "replace"))
5078                 known_errorHandler = 2;
5079             else if (!strcmp(errors, "ignore"))
5080                 known_errorHandler = 3;
5081             else if (!strcmp(errors, "xmlcharrefreplace"))
5082                 known_errorHandler = 4;
5083             else
5084                 known_errorHandler = 0;
5085         }
5086         switch (known_errorHandler) {
5087             case 1: /* strict */
5088                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5089                 goto onError;
5090             case 2: /* replace */
5091                 for (p = collstart; p < collend; ++p)
5092                     *output++ = '?';
5093                 /* fall through */
5094             case 3: /* ignore */
5095                 p = collend;
5096                 break;
5097             case 4: /* xmlcharrefreplace */
5098                 /* generate replacement (temporarily (mis)uses p) */
5099                 for (p = collstart; p < collend; ++p)
5100                     output += sprintf(output, "&#%d;", (int)*p);
5101                 p = collend;
5102                 break;
5103             default:
5104                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5105                     encoding, reason, s, length, &exc,
5106                     collstart-s, collend-s, &newpos);
5107                 if (repunicode == NULL)
5108                     goto onError;
5109                 /* generate replacement  */
5110                 repsize = PyUnicode_GET_SIZE(repunicode);
5111                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5112                     Py_UNICODE ch = *uni2;
5113                     if (Py_UNICODE_ISSPACE(ch))
5114                         *output++ = ' ';
5115                     else {
5116                         decimal = Py_UNICODE_TODECIMAL(ch);
5117                         if (decimal >= 0)
5118                             *output++ = '0' + decimal;
5119                         else if (0 < ch && ch < 256)
5120                             *output++ = (char)ch;
5121                         else {
5122                             Py_DECREF(repunicode);
5123                             raise_encode_exception(&exc, encoding,
5124                                 s, length, collstart-s, collend-s, reason);
5125                             goto onError;
5126                         }
5127                     }
5128                 }
5129                 p = s + newpos;
5130                 Py_DECREF(repunicode);
5131         }
5132     }
5133     /* 0-terminate the output string */
5134     *output++ = '\0';
5135     Py_XDECREF(exc);
5136     Py_XDECREF(errorHandler);
5137     return 0;
5138
5139  onError:
5140     Py_XDECREF(exc);
5141     Py_XDECREF(errorHandler);
5142     return -1;
5143 }
5144
5145 /* --- Helpers ------------------------------------------------------------ */
5146
5147 #include "stringlib/unicodedefs.h"
5148
5149 #define FROM_UNICODE
5150
5151 #include "stringlib/fastsearch.h"
5152
5153 #include "stringlib/count.h"
5154 #include "stringlib/find.h"
5155 #include "stringlib/partition.h"
5156
/* helper macro to fixup start/end slice values

   Operates on the variables named start and end in the enclosing
   scope, using (obj)->length as the sequence length: negative values
   are taken relative to the length and then clamped to 0, and end is
   clamped to the length.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement and can be used safely in an unbraced if/else.
   Invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5169
5170 Py_ssize_t PyUnicode_Count(PyObject *str,
5171                            PyObject *substr,
5172                            Py_ssize_t start,
5173                            Py_ssize_t end)
5174 {
5175     Py_ssize_t result;
5176     PyUnicodeObject* str_obj;
5177     PyUnicodeObject* sub_obj;
5178
5179     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5180     if (!str_obj)
5181         return -1;
5182     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5183     if (!sub_obj) {
5184         Py_DECREF(str_obj);
5185         return -1;
5186     }
5187
5188     FIX_START_END(str_obj);
5189
5190     result = stringlib_count(
5191         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5192         );
5193
5194     Py_DECREF(sub_obj);
5195     Py_DECREF(str_obj);
5196
5197     return result;
5198 }
5199
5200 Py_ssize_t PyUnicode_Find(PyObject *str,
5201                           PyObject *sub,
5202                           Py_ssize_t start,
5203                           Py_ssize_t end,
5204                           int direction)
5205 {
5206     Py_ssize_t result;
5207
5208     str = PyUnicode_FromObject(str);
5209     if (!str)
5210         return -2;
5211     sub = PyUnicode_FromObject(sub);
5212     if (!sub) {
5213         Py_DECREF(str);
5214         return -2;
5215     }
5216
5217     if (direction > 0)
5218         result = stringlib_find_slice(
5219             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5220             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5221             start, end
5222             );
5223     else
5224         result = stringlib_rfind_slice(
5225             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5226             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5227             start, end
5228             );
5229
5230     Py_DECREF(str);
5231     Py_DECREF(sub);
5232
5233     return result;
5234 }
5235
5236 static
5237 int tailmatch(PyUnicodeObject *self,
5238               PyUnicodeObject *substring,
5239               Py_ssize_t start,
5240               Py_ssize_t end,
5241               int direction)
5242 {
5243     if (substring->length == 0)
5244         return 1;
5245
5246     FIX_START_END(self);
5247
5248     end -= substring->length;
5249     if (end < start)
5250         return 0;
5251
5252     if (direction > 0) {
5253         if (Py_UNICODE_MATCH(self, end, substring))
5254             return 1;
5255     } else {
5256         if (Py_UNICODE_MATCH(self, start, substring))
5257             return 1;
5258     }
5259
5260     return 0;
5261 }
5262
5263 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5264                         PyObject *substr,
5265                         Py_ssize_t start,
5266                         Py_ssize_t end,
5267                         int direction)
5268 {
5269     Py_ssize_t result;
5270
5271     str = PyUnicode_FromObject(str);
5272     if (str == NULL)
5273         return -1;
5274     substr = PyUnicode_FromObject(substr);
5275     if (substr == NULL) {
5276         Py_DECREF(str);
5277         return -1;
5278     }
5279
5280     result = tailmatch((PyUnicodeObject *)str,
5281                        (PyUnicodeObject *)substr,
5282                        start, end, direction);
5283     Py_DECREF(str);
5284     Py_DECREF(substr);
5285     return result;
5286 }
5287
5288 /* Apply fixfct filter to the Unicode object self and return a
5289    reference to the modified object */
5290
5291 static
5292 PyObject *fixup(PyUnicodeObject *self,
5293                 int (*fixfct)(PyUnicodeObject *s))
5294 {
5295
5296     PyUnicodeObject *u;
5297
5298     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5299     if (u == NULL)
5300         return NULL;
5301
5302     Py_UNICODE_COPY(u->str, self->str, self->length);
5303
5304     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5305         /* fixfct should return TRUE if it modified the buffer. If
5306            FALSE, return a reference to the original buffer instead
5307            (to save space, not time) */
5308         Py_INCREF(self);
5309         Py_DECREF(u);
5310         return (PyObject*) self;
5311     }
5312     return (PyObject*) u;
5313 }
5314
5315 static
5316 int fixupper(PyUnicodeObject *self)
5317 {
5318     Py_ssize_t len = self->length;
5319     Py_UNICODE *s = self->str;
5320     int status = 0;
5321
5322     while (len-- > 0) {
5323         register Py_UNICODE ch;
5324
5325         ch = Py_UNICODE_TOUPPER(*s);
5326         if (ch != *s) {
5327             status = 1;
5328             *s = ch;
5329         }
5330         s++;
5331     }
5332
5333     return status;
5334 }
5335
5336 static
5337 int fixlower(PyUnicodeObject *self)
5338 {
5339     Py_ssize_t len = self->length;
5340     Py_UNICODE *s = self->str;
5341     int status = 0;
5342
5343     while (len-- > 0) {
5344         register Py_UNICODE ch;
5345
5346         ch = Py_UNICODE_TOLOWER(*s);
5347         if (ch != *s) {
5348             status = 1;
5349             *s = ch;
5350         }
5351         s++;
5352     }
5353
5354     return status;
5355 }
5356
5357 static
5358 int fixswapcase(PyUnicodeObject *self)
5359 {
5360     Py_ssize_t len = self->length;
5361     Py_UNICODE *s = self->str;
5362     int status = 0;
5363
5364     while (len-- > 0) {
5365         if (Py_UNICODE_ISUPPER(*s)) {
5366             *s = Py_UNICODE_TOLOWER(*s);
5367             status = 1;
5368         } else if (Py_UNICODE_ISLOWER(*s)) {
5369             *s = Py_UNICODE_TOUPPER(*s);
5370             status = 1;
5371         }
5372         s++;
5373     }
5374
5375     return status;
5376 }
5377
5378 static
5379 int fixcapitalize(PyUnicodeObject *self)
5380 {
5381     Py_ssize_t len = self->length;
5382     Py_UNICODE *s = self->str;
5383     int status = 0;
5384
5385     if (len == 0)
5386         return 0;
5387     if (Py_UNICODE_ISLOWER(*s)) {
5388         *s = Py_UNICODE_TOUPPER(*s);
5389         status = 1;
5390     }
5391     s++;
5392     while (--len > 0) {
5393         if (Py_UNICODE_ISUPPER(*s)) {
5394             *s = Py_UNICODE_TOLOWER(*s);
5395             status = 1;
5396         }
5397         s++;
5398     }
5399     return status;
5400 }
5401
5402 static
5403 int fixtitle(PyUnicodeObject *self)
5404 {
5405     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5406     register Py_UNICODE *e;
5407     int previous_is_cased;
5408
5409     /* Shortcut for single character strings */
5410     if (PyUnicode_GET_SIZE(self) == 1) {
5411         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5412         if (*p != ch) {
5413             *p = ch;
5414             return 1;
5415         }
5416         else
5417             return 0;
5418     }
5419
5420     e = p + PyUnicode_GET_SIZE(self);
5421     previous_is_cased = 0;
5422     for (; p < e; p++) {
5423         register const Py_UNICODE ch = *p;
5424
5425         if (previous_is_cased)
5426             *p = Py_UNICODE_TOLOWER(ch);
5427         else
5428             *p = Py_UNICODE_TOTITLE(ch);
5429
5430         if (Py_UNICODE_ISLOWER(ch) ||
5431             Py_UNICODE_ISUPPER(ch) ||
5432             Py_UNICODE_ISTITLE(ch))
5433             previous_is_cased = 1;
5434         else
5435             previous_is_cased = 0;
5436     }
5437     return 1;
5438 }
5439
5440 PyObject *
5441 PyUnicode_Join(PyObject *separator, PyObject *seq)
5442 {
5443     PyObject *internal_separator = NULL;
5444     const Py_UNICODE blank = ' ';
5445     const Py_UNICODE *sep = &blank;
5446     Py_ssize_t seplen = 1;
5447     PyUnicodeObject *res = NULL; /* the result */
5448     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5449     Py_ssize_t res_used;         /* # used bytes */
5450     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5451     PyObject *fseq;          /* PySequence_Fast(seq) */
5452     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5453     PyObject *item;
5454     Py_ssize_t i;
5455
5456     fseq = PySequence_Fast(seq, "");
5457     if (fseq == NULL) {
5458         return NULL;
5459     }
5460
5461     /* Grrrr.  A codec may be invoked to convert str objects to
5462      * Unicode, and so it's possible to call back into Python code
5463      * during PyUnicode_FromObject(), and so it's possible for a sick
5464      * codec to change the size of fseq (if seq is a list).  Therefore
5465      * we have to keep refetching the size -- can't assume seqlen
5466      * is invariant.
5467      */
5468     seqlen = PySequence_Fast_GET_SIZE(fseq);
5469     /* If empty sequence, return u"". */
5470     if (seqlen == 0) {
5471         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5472         goto Done;
5473     }
5474     /* If singleton sequence with an exact Unicode, return that. */
5475     if (seqlen == 1) {
5476         item = PySequence_Fast_GET_ITEM(fseq, 0);
5477         if (PyUnicode_CheckExact(item)) {
5478             Py_INCREF(item);
5479             res = (PyUnicodeObject *)item;
5480             goto Done;
5481         }
5482     }
5483
5484     /* At least two items to join, or one that isn't exact Unicode. */
5485     if (seqlen > 1) {
5486         /* Set up sep and seplen -- they're needed. */
5487         if (separator == NULL) {
5488             sep = &blank;
5489             seplen = 1;
5490         }
5491         else {
5492             internal_separator = PyUnicode_FromObject(separator);
5493             if (internal_separator == NULL)
5494                 goto onError;
5495             sep = PyUnicode_AS_UNICODE(internal_separator);
5496             seplen = PyUnicode_GET_SIZE(internal_separator);
5497             /* In case PyUnicode_FromObject() mutated seq. */
5498             seqlen = PySequence_Fast_GET_SIZE(fseq);
5499         }
5500     }
5501
5502     /* Get space. */
5503     res = _PyUnicode_New(res_alloc);
5504     if (res == NULL)
5505         goto onError;
5506     res_p = PyUnicode_AS_UNICODE(res);
5507     res_used = 0;
5508
5509     for (i = 0; i < seqlen; ++i) {
5510         Py_ssize_t itemlen;
5511         Py_ssize_t new_res_used;
5512
5513         item = PySequence_Fast_GET_ITEM(fseq, i);
5514         /* Convert item to Unicode. */
5515         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5516             PyErr_Format(PyExc_TypeError,
5517                          "sequence item %zd: expected string or Unicode,"
5518                          " %.80s found",
5519                          i, Py_TYPE(item)->tp_name);
5520             goto onError;
5521         }
5522         item = PyUnicode_FromObject(item);
5523         if (item == NULL)
5524             goto onError;
5525         /* We own a reference to item from here on. */
5526
5527         /* In case PyUnicode_FromObject() mutated seq. */
5528         seqlen = PySequence_Fast_GET_SIZE(fseq);
5529
5530         /* Make sure we have enough space for the separator and the item. */
5531         itemlen = PyUnicode_GET_SIZE(item);
5532         new_res_used = res_used + itemlen;
5533         if (new_res_used < 0)
5534             goto Overflow;
5535         if (i < seqlen - 1) {
5536             new_res_used += seplen;
5537             if (new_res_used < 0)
5538                 goto Overflow;
5539         }
5540         if (new_res_used > res_alloc) {
5541             /* double allocated size until it's big enough */
5542             do {
5543                 res_alloc += res_alloc;
5544                 if (res_alloc <= 0)
5545                     goto Overflow;
5546             } while (new_res_used > res_alloc);
5547             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5548                 Py_DECREF(item);
5549                 goto onError;
5550             }
5551             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5552         }
5553
5554         /* Copy item, and maybe the separator. */
5555         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5556         res_p += itemlen;
5557         if (i < seqlen - 1) {
5558             Py_UNICODE_COPY(res_p, sep, seplen);
5559             res_p += seplen;
5560         }
5561         Py_DECREF(item);
5562         res_used = new_res_used;
5563     }
5564
5565     /* Shrink res to match the used area; this probably can't fail,
5566      * but it's cheap to check.
5567      */
5568     if (_PyUnicode_Resize(&res, res_used) < 0)
5569         goto onError;
5570
5571  Done:
5572     Py_XDECREF(internal_separator);
5573     Py_DECREF(fseq);
5574     return (PyObject *)res;
5575
5576  Overflow:
5577     PyErr_SetString(PyExc_OverflowError,
5578                     "join() result is too long for a Python string");
5579     Py_DECREF(item);
5580     /* fall through */
5581
5582  onError:
5583     Py_XDECREF(internal_separator);
5584     Py_DECREF(fseq);
5585     Py_XDECREF(res);
5586     return NULL;
5587 }
5588
5589 static
5590 PyUnicodeObject *pad(PyUnicodeObject *self,
5591                      Py_ssize_t left,
5592                      Py_ssize_t right,
5593                      Py_UNICODE fill)
5594 {
5595     PyUnicodeObject *u;
5596
5597     if (left < 0)
5598         left = 0;
5599     if (right < 0)
5600         right = 0;
5601
5602     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5603         Py_INCREF(self);
5604         return self;
5605     }
5606
5607     if (left > PY_SSIZE_T_MAX - self->length ||
5608         right > PY_SSIZE_T_MAX - (left + self->length)) {
5609         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5610         return NULL;
5611     }
5612     u = _PyUnicode_New(left + self->length + right);
5613     if (u) {
5614         if (left)
5615             Py_UNICODE_FILL(u->str, fill, left);
5616         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5617         if (right)
5618             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5619     }
5620
5621     return u;
5622 }
5623
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the expansion site providing: a `PyObject *str` scratch
   variable, a `PyObject *list` to append to, and an `onError` label that
   is jumped to when either the allocation or the append fails.  The
   temporary reference in `str` is always released before leaving.

   Wrapped in do { ... } while (0) so the macro expands to exactly one
   statement (safe in an unbraced if/else).  Invoke it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_status_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_status_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_status_)                                       \
            goto onError;                                               \
    } while (0)
5634
5635 static
5636 PyObject *split_whitespace(PyUnicodeObject *self,
5637                            PyObject *list,
5638                            Py_ssize_t maxcount)
5639 {
5640     register Py_ssize_t i;
5641     register Py_ssize_t j;
5642     Py_ssize_t len = self->length;
5643     PyObject *str;
5644     register const Py_UNICODE *buf = self->str;
5645
5646     for (i = j = 0; i < len; ) {
5647         /* find a token */
5648         while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5649             i++;
5650         j = i;
5651         while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5652             i++;
5653         if (j < i) {
5654             if (maxcount-- <= 0)
5655                 break;
5656             SPLIT_APPEND(buf, j, i);
5657             while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5658                 i++;
5659             j = i;
5660         }
5661     }
5662     if (j < len) {
5663         SPLIT_APPEND(buf, j, len);
5664     }
5665     return list;
5666
5667  onError:
5668     Py_DECREF(list);
5669     return NULL;
5670 }
5671
5672 PyObject *PyUnicode_Splitlines(PyObject *string,
5673                                int keepends)
5674 {
5675     register Py_ssize_t i;
5676     register Py_ssize_t j;
5677     Py_ssize_t len;
5678     PyObject *list;
5679     PyObject *str;
5680     Py_UNICODE *data;
5681
5682     string = PyUnicode_FromObject(string);
5683     if (string == NULL)
5684         return NULL;
5685     data = PyUnicode_AS_UNICODE(string);
5686     len = PyUnicode_GET_SIZE(string);
5687
5688     list = PyList_New(0);
5689     if (!list)
5690         goto onError;
5691
5692     for (i = j = 0; i < len; ) {
5693         Py_ssize_t eol;
5694
5695         /* Find a line and append it */
5696         while (i < len && !BLOOM_LINEBREAK(data[i]))
5697             i++;
5698
5699         /* Skip the line break reading CRLF as one line break */
5700         eol = i;
5701         if (i < len) {
5702             if (data[i] == '\r' && i + 1 < len &&
5703                 data[i+1] == '\n')
5704                 i += 2;
5705             else
5706                 i++;
5707             if (keepends)
5708                 eol = i;
5709         }
5710         SPLIT_APPEND(data, j, eol);
5711         j = i;
5712     }
5713     if (j < len) {
5714         SPLIT_APPEND(data, j, len);
5715     }
5716
5717     Py_DECREF(string);
5718     return list;
5719
5720  onError:
5721     Py_XDECREF(list);
5722     Py_DECREF(string);
5723     return NULL;
5724 }
5725
5726 static
5727 PyObject *split_char(PyUnicodeObject *self,
5728                      PyObject *list,
5729                      Py_UNICODE ch,
5730                      Py_ssize_t maxcount)
5731 {
5732     register Py_ssize_t i;
5733     register Py_ssize_t j;
5734     Py_ssize_t len = self->length;
5735     PyObject *str;
5736     register const Py_UNICODE *buf = self->str;
5737
5738     for (i = j = 0; i < len; ) {
5739         if (buf[i] == ch) {
5740             if (maxcount-- <= 0)
5741                 break;
5742             SPLIT_APPEND(buf, j, i);
5743             i = j = i + 1;
5744         } else
5745             i++;
5746     }
5747     if (j <= len) {
5748         SPLIT_APPEND(buf, j, len);
5749     }
5750     return list;
5751
5752  onError:
5753     Py_DECREF(list);
5754     return NULL;
5755 }
5756
5757 static
5758 PyObject *split_substring(PyUnicodeObject *self,
5759                           PyObject *list,
5760                           PyUnicodeObject *substring,
5761                           Py_ssize_t maxcount)
5762 {
5763     register Py_ssize_t i;
5764     register Py_ssize_t j;
5765     Py_ssize_t len = self->length;
5766     Py_ssize_t sublen = substring->length;
5767     PyObject *str;
5768
5769     for (i = j = 0; i <= len - sublen; ) {
5770         if (Py_UNICODE_MATCH(self, i, substring)) {
5771             if (maxcount-- <= 0)
5772                 break;
5773             SPLIT_APPEND(self->str, j, i);
5774             i = j = i + sublen;
5775         } else
5776             i++;
5777     }
5778     if (j <= len) {
5779         SPLIT_APPEND(self->str, j, len);
5780     }
5781     return list;
5782
5783  onError:
5784     Py_DECREF(list);
5785     return NULL;
5786 }
5787
5788 static
5789 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5790                             PyObject *list,
5791                             Py_ssize_t maxcount)
5792 {
5793     register Py_ssize_t i;
5794     register Py_ssize_t j;
5795     Py_ssize_t len = self->length;
5796     PyObject *str;
5797     register const Py_UNICODE *buf = self->str;
5798
5799     for (i = j = len - 1; i >= 0; ) {
5800         /* find a token */
5801         while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5802             i--;
5803         j = i;
5804         while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5805             i--;
5806         if (j > i) {
5807             if (maxcount-- <= 0)
5808                 break;
5809             SPLIT_APPEND(buf, i + 1, j + 1);
5810             while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5811                 i--;
5812             j = i;
5813         }
5814     }
5815     if (j >= 0) {
5816         SPLIT_APPEND(buf, 0, j + 1);
5817     }
5818     if (PyList_Reverse(list) < 0)
5819         goto onError;
5820     return list;
5821
5822  onError:
5823     Py_DECREF(list);
5824     return NULL;
5825 }
5826
5827 static
5828 PyObject *rsplit_char(PyUnicodeObject *self,
5829                       PyObject *list,
5830                       Py_UNICODE ch,
5831                       Py_ssize_t maxcount)
5832 {
5833     register Py_ssize_t i;
5834     register Py_ssize_t j;
5835     Py_ssize_t len = self->length;
5836     PyObject *str;
5837     register const Py_UNICODE *buf = self->str;
5838
5839     for (i = j = len - 1; i >= 0; ) {
5840         if (buf[i] == ch) {
5841             if (maxcount-- <= 0)
5842                 break;
5843             SPLIT_APPEND(buf, i + 1, j + 1);
5844             j = i = i - 1;
5845         } else
5846             i--;
5847     }
5848     if (j >= -1) {
5849         SPLIT_APPEND(buf, 0, j + 1);
5850     }
5851     if (PyList_Reverse(list) < 0)
5852         goto onError;
5853     return list;
5854
5855  onError:
5856     Py_DECREF(list);
5857     return NULL;
5858 }
5859
5860 static
5861 PyObject *rsplit_substring(PyUnicodeObject *self,
5862                            PyObject *list,
5863                            PyUnicodeObject *substring,
5864                            Py_ssize_t maxcount)
5865 {
5866     register Py_ssize_t i;
5867     register Py_ssize_t j;
5868     Py_ssize_t len = self->length;
5869     Py_ssize_t sublen = substring->length;
5870     PyObject *str;
5871
5872     for (i = len - sublen, j = len; i >= 0; ) {
5873         if (Py_UNICODE_MATCH(self, i, substring)) {
5874             if (maxcount-- <= 0)
5875                 break;
5876             SPLIT_APPEND(self->str, i + sublen, j);
5877             j = i;
5878             i -= sublen;
5879         } else
5880             i--;
5881     }
5882     if (j >= 0) {
5883         SPLIT_APPEND(self->str, 0, j);
5884     }
5885     if (PyList_Reverse(list) < 0)
5886         goto onError;
5887     return list;
5888
5889  onError:
5890     Py_DECREF(list);
5891     return NULL;
5892 }
5893
5894 #undef SPLIT_APPEND
5895
5896 static
5897 PyObject *split(PyUnicodeObject *self,
5898                 PyUnicodeObject *substring,
5899                 Py_ssize_t maxcount)
5900 {
5901     PyObject *list;
5902
5903     if (maxcount < 0)
5904         maxcount = PY_SSIZE_T_MAX;
5905
5906     list = PyList_New(0);
5907     if (!list)
5908         return NULL;
5909
5910     if (substring == NULL)
5911         return split_whitespace(self,list,maxcount);
5912
5913     else if (substring->length == 1)
5914         return split_char(self,list,substring->str[0],maxcount);
5915
5916     else if (substring->length == 0) {
5917         Py_DECREF(list);
5918         PyErr_SetString(PyExc_ValueError, "empty separator");
5919         return NULL;
5920     }
5921     else
5922         return split_substring(self,list,substring,maxcount);
5923 }
5924
5925 static
5926 PyObject *rsplit(PyUnicodeObject *self,
5927                  PyUnicodeObject *substring,
5928                  Py_ssize_t maxcount)
5929 {
5930     PyObject *list;
5931
5932     if (maxcount < 0)
5933         maxcount = PY_SSIZE_T_MAX;
5934
5935     list = PyList_New(0);
5936     if (!list)
5937         return NULL;
5938
5939     if (substring == NULL)
5940         return rsplit_whitespace(self,list,maxcount);
5941
5942     else if (substring->length == 1)
5943         return rsplit_char(self,list,substring->str[0],maxcount);
5944
5945     else if (substring->length == 0) {
5946         Py_DECREF(list);
5947         PyErr_SetString(PyExc_ValueError, "empty separator");
5948         return NULL;
5949     }
5950     else
5951         return rsplit_substring(self,list,substring,maxcount);
5952 }
5953
5954 static
5955 PyObject *replace(PyUnicodeObject *self,
5956                   PyUnicodeObject *str1,
5957                   PyUnicodeObject *str2,
5958                   Py_ssize_t maxcount)
5959 {
5960     PyUnicodeObject *u;
5961
5962     if (maxcount < 0)
5963         maxcount = PY_SSIZE_T_MAX;
5964
5965     if (str1->length == str2->length) {
5966         /* same length */
5967         Py_ssize_t i;
5968         if (str1->length == 1) {
5969             /* replace characters */
5970             Py_UNICODE u1, u2;
5971             if (!findchar(self->str, self->length, str1->str[0]))
5972                 goto nothing;
5973             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5974             if (!u)
5975                 return NULL;
5976             Py_UNICODE_COPY(u->str, self->str, self->length);
5977             u1 = str1->str[0];
5978             u2 = str2->str[0];
5979             for (i = 0; i < u->length; i++)
5980                 if (u->str[i] == u1) {
5981                     if (--maxcount < 0)
5982                         break;
5983                     u->str[i] = u2;
5984                 }
5985         } else {
5986             i = fastsearch(
5987                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5988                 );
5989             if (i < 0)
5990                 goto nothing;
5991             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5992             if (!u)
5993                 return NULL;
5994             Py_UNICODE_COPY(u->str, self->str, self->length);
5995             while (i <= self->length - str1->length)
5996                 if (Py_UNICODE_MATCH(self, i, str1)) {
5997                     if (--maxcount < 0)
5998                         break;
5999                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6000                     i += str1->length;
6001                 } else
6002                     i++;
6003         }
6004     } else {
6005
6006         Py_ssize_t n, i, j, e;
6007         Py_ssize_t product, new_size, delta;
6008         Py_UNICODE *p;
6009
6010         /* replace strings */
6011         n = stringlib_count(self->str, self->length, str1->str, str1->length);
6012         if (n > maxcount)
6013             n = maxcount;
6014         if (n == 0)
6015             goto nothing;
6016         /* new_size = self->length + n * (str2->length - str1->length)); */
6017         delta = (str2->length - str1->length);
6018         if (delta == 0) {
6019             new_size = self->length;
6020         } else {
6021             product = n * (str2->length - str1->length);
6022             if ((product / (str2->length - str1->length)) != n) {
6023                 PyErr_SetString(PyExc_OverflowError,
6024                                 "replace string is too long");
6025                 return NULL;
6026             }
6027             new_size = self->length + product;
6028             if (new_size < 0) {
6029                 PyErr_SetString(PyExc_OverflowError,
6030                                 "replace string is too long");
6031                 return NULL;
6032             }
6033         }
6034         u = _PyUnicode_New(new_size);
6035         if (!u)
6036             return NULL;
6037         i = 0;
6038         p = u->str;
6039         e = self->length - str1->length;
6040         if (str1->length > 0) {
6041             while (n-- > 0) {
6042                 /* look for next match */
6043                 j = i;
6044                 while (j <= e) {
6045                     if (Py_UNICODE_MATCH(self, j, str1))
6046                         break;
6047                     j++;
6048                 }
6049                 if (j > i) {
6050                     if (j > e)
6051                         break;
6052                     /* copy unchanged part [i:j] */
6053                     Py_UNICODE_COPY(p, self->str+i, j-i);
6054                     p += j - i;
6055                 }
6056                 /* copy substitution string */
6057                 if (str2->length > 0) {
6058                     Py_UNICODE_COPY(p, str2->str, str2->length);
6059                     p += str2->length;
6060                 }
6061                 i = j + str1->length;
6062             }
6063             if (i < self->length)
6064                 /* copy tail [i:] */
6065                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6066         } else {
6067             /* interleave */
6068             while (n > 0) {
6069                 Py_UNICODE_COPY(p, str2->str, str2->length);
6070                 p += str2->length;
6071                 if (--n <= 0)
6072                     break;
6073                 *p++ = self->str[i++];
6074             }
6075             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6076         }
6077     }
6078     return (PyObject *) u;
6079
6080 nothing:
6081     /* nothing to replace; return original string (when possible) */
6082     if (PyUnicode_CheckExact(self)) {
6083         Py_INCREF(self);
6084         return (PyObject *) self;
6085     }
6086     return PyUnicode_FromUnicode(self->str, self->length);
6087 }
6088
6089 /* --- Unicode Object Methods --------------------------------------------- */
6090
6091 PyDoc_STRVAR(title__doc__,
6092 "S.title() -> unicode\n\
6093 \n\
6094 Return a titlecased version of S, i.e. words start with title case\n\
6095 characters, all remaining cased characters have lower case.");
6096
6097 static PyObject*
6098 unicode_title(PyUnicodeObject *self)
6099 {
6100     return fixup(self, fixtitle);
6101 }
6102
6103 PyDoc_STRVAR(capitalize__doc__,
6104 "S.capitalize() -> unicode\n\
6105 \n\
6106 Return a capitalized version of S, i.e. make the first character\n\
6107 have upper case.");
6108
6109 static PyObject*
6110 unicode_capitalize(PyUnicodeObject *self)
6111 {
6112     return fixup(self, fixcapitalize);
6113 }
6114
#if 0
/* NOTE: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    PyObject *capitalized;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        word = PyList_GET_ITEM(words, idx);
        capitalized = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6152
6153 /* Argument converter.  Coerces to a single unicode character */
6154
6155 static int
6156 convert_uc(PyObject *obj, void *addr)
6157 {
6158         Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6159         PyObject *uniobj;
6160         Py_UNICODE *unistr;
6161
6162         uniobj = PyUnicode_FromObject(obj);
6163         if (uniobj == NULL) {
6164                 PyErr_SetString(PyExc_TypeError,
6165                         "The fill character cannot be converted to Unicode");
6166                 return 0;
6167         }
6168         if (PyUnicode_GET_SIZE(uniobj) != 1) {
6169                 PyErr_SetString(PyExc_TypeError,
6170                         "The fill character must be exactly one character long");
6171                 Py_DECREF(uniobj);
6172                 return 0;
6173         }
6174         unistr = PyUnicode_AS_UNICODE(uniobj);
6175         *fillcharloc = unistr[0];
6176         Py_DECREF(uniobj);
6177         return 1;
6178 }
6179
6180 PyDoc_STRVAR(center__doc__,
6181 "S.center(width[, fillchar]) -> unicode\n\
6182 \n\
6183 Return S centered in a Unicode string of length width. Padding is\n\
6184 done using the specified fill character (default is a space)");
6185
6186 static PyObject *
6187 unicode_center(PyUnicodeObject *self, PyObject *args)
6188 {
6189     Py_ssize_t marg, left;
6190     Py_ssize_t width;
6191     Py_UNICODE fillchar = ' ';
6192
6193     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6194         return NULL;
6195
6196     if (self->length >= width && PyUnicode_CheckExact(self)) {
6197         Py_INCREF(self);
6198         return (PyObject*) self;
6199     }
6200
6201     marg = width - self->length;
6202     left = marg / 2 + (marg & width & 1);
6203
6204     return (PyObject*) pad(self, left, marg - left, fillchar);
6205 }
6206
6207 #if 0
6208
6209 /* This code should go into some future Unicode collation support
6210    module. The basic comparison should compare ordinals on a naive
6211    basis (this is what Java does and thus JPython too). */
6212
6213 /* speedy UTF-16 code point order comparison */
6214 /* gleaned from: */
6215 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6216
/* Adjustment table indexed by the top five bits (c >> 11) of a 16-bit
   value.  Entry 27 (values 0xD800-0xDFFF) adds 0x2000 and entries 28-31
   (values 0xE000-0xFFFF) subtract 0x800; every other entry is zero. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled (#if 0) variant of unicode_compare: compares the two strings
   element by element after remapping each element through utf16Fixup.
   Returns -1, 0 or 1.  When all compared elements are equal, the shorter
   string orders first. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 is 0xD000; values at or below it always index a
           zero table entry, so the lookup is skipped for them. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix is equal: order by remaining length. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6256
6257 #else
6258
6259 static int
6260 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6261 {
6262     register Py_ssize_t len1, len2;
6263
6264     Py_UNICODE *s1 = str1->str;
6265     Py_UNICODE *s2 = str2->str;
6266
6267     len1 = str1->length;
6268     len2 = str2->length;
6269
6270     while (len1 > 0 && len2 > 0) {
6271         Py_UNICODE c1, c2;
6272
6273         c1 = *s1++;
6274         c2 = *s2++;
6275
6276         if (c1 != c2)
6277             return (c1 < c2) ? -1 : 1;
6278
6279         len1--; len2--;
6280     }
6281
6282     return (len1 < len2) ? -1 : (len1 != len2);
6283 }
6284
6285 #endif
6286
6287 int PyUnicode_Compare(PyObject *left,
6288                       PyObject *right)
6289 {
6290     PyUnicodeObject *u = NULL, *v = NULL;
6291     int result;
6292
6293     /* Coerce the two arguments */
6294     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6295     if (u == NULL)
6296         goto onError;
6297     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6298     if (v == NULL)
6299         goto onError;
6300
6301     /* Shortcut for empty or interned objects */
6302     if (v == u) {
6303         Py_DECREF(u);
6304         Py_DECREF(v);
6305         return 0;
6306     }
6307
6308     result = unicode_compare(u, v);
6309
6310     Py_DECREF(u);
6311     Py_DECREF(v);
6312     return result;
6313
6314 onError:
6315     Py_XDECREF(u);
6316     Py_XDECREF(v);
6317     return -1;
6318 }
6319
6320 PyObject *PyUnicode_RichCompare(PyObject *left,
6321                                 PyObject *right,
6322                                 int op)
6323 {
6324     int result;
6325
6326     result = PyUnicode_Compare(left, right);
6327     if (result == -1 && PyErr_Occurred())
6328         goto onError;
6329
6330     /* Convert the return value to a Boolean */
6331     switch (op) {
6332     case Py_EQ:
6333         result = (result == 0);
6334         break;
6335     case Py_NE:
6336         result = (result != 0);
6337         break;
6338     case Py_LE:
6339         result = (result <= 0);
6340         break;
6341     case Py_GE:
6342         result = (result >= 0);
6343         break;
6344     case Py_LT:
6345         result = (result == -1);
6346         break;
6347     case Py_GT:
6348         result = (result == 1);
6349         break;
6350     }
6351     return PyBool_FromLong(result);
6352
6353  onError:
6354
6355     /* Standard case
6356
6357        Type errors mean that PyUnicode_FromObject() could not convert
6358        one of the arguments (usually the right hand side) to Unicode,
6359        ie. we can't handle the comparison request. However, it is
6360        possible that the other object knows a comparison method, which
6361        is why we return Py_NotImplemented to give the other object a
6362        chance.
6363
6364     */
6365     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6366         PyErr_Clear();
6367         Py_INCREF(Py_NotImplemented);
6368         return Py_NotImplemented;
6369     }
6370     if (op != Py_EQ && op != Py_NE)
6371         return NULL;
6372
6373     /* Equality comparison.
6374
6375        This is a special case: we silence any PyExc_UnicodeDecodeError
6376        and instead turn it into a PyErr_UnicodeWarning.
6377
6378     */
6379     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6380         return NULL;
6381     PyErr_Clear();
6382     if (PyErr_Warn(PyExc_UnicodeWarning,
6383                    (op == Py_EQ) ?
6384                    "Unicode equal comparison "
6385                    "failed to convert both arguments to Unicode - "
6386                    "interpreting them as being unequal" :
6387                    "Unicode unequal comparison "
6388                    "failed to convert both arguments to Unicode - "
6389                    "interpreting them as being unequal"
6390                    ) < 0)
6391         return NULL;
6392     result = (op == Py_NE);
6393     return PyBool_FromLong(result);
6394 }
6395
6396 int PyUnicode_Contains(PyObject *container,
6397                        PyObject *element)
6398 {
6399     PyObject *str, *sub;
6400     int result;
6401
6402     /* Coerce the two arguments */
6403     sub = PyUnicode_FromObject(element);
6404     if (!sub) {
6405         PyErr_SetString(PyExc_TypeError,
6406             "'in <string>' requires string as left operand");
6407         return -1;
6408     }
6409
6410     str = PyUnicode_FromObject(container);
6411     if (!str) {
6412         Py_DECREF(sub);
6413         return -1;
6414     }
6415
6416     result = stringlib_contains_obj(str, sub);
6417
6418     Py_DECREF(str);
6419     Py_DECREF(sub);
6420
6421     return result;
6422 }
6423
6424 /* Concat to string or Unicode object giving a new Unicode object. */
6425
6426 PyObject *PyUnicode_Concat(PyObject *left,
6427                            PyObject *right)
6428 {
6429     PyUnicodeObject *u = NULL, *v = NULL, *w;
6430
6431     /* Coerce the two arguments */
6432     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6433     if (u == NULL)
6434         goto onError;
6435     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6436     if (v == NULL)
6437         goto onError;
6438
6439     /* Shortcuts */
6440     if (v == unicode_empty) {
6441         Py_DECREF(v);
6442         return (PyObject *)u;
6443     }
6444     if (u == unicode_empty) {
6445         Py_DECREF(u);
6446         return (PyObject *)v;
6447     }
6448
6449     /* Concat the two Unicode strings */
6450     w = _PyUnicode_New(u->length + v->length);
6451     if (w == NULL)
6452         goto onError;
6453     Py_UNICODE_COPY(w->str, u->str, u->length);
6454     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6455
6456     Py_DECREF(u);
6457     Py_DECREF(v);
6458     return (PyObject *)w;
6459
6460 onError:
6461     Py_XDECREF(u);
6462     Py_XDECREF(v);
6463     return NULL;
6464 }
6465
6466 PyDoc_STRVAR(count__doc__,
6467 "S.count(sub[, start[, end]]) -> int\n\
6468 \n\
6469 Return the number of non-overlapping occurrences of substring sub in\n\
6470 Unicode string S[start:end].  Optional arguments start and end are\n\
6471 interpreted as in slice notation.");
6472
6473 static PyObject *
6474 unicode_count(PyUnicodeObject *self, PyObject *args)
6475 {
6476     PyUnicodeObject *substring;
6477     Py_ssize_t start = 0;
6478     Py_ssize_t end = PY_SSIZE_T_MAX;
6479     PyObject *result;
6480
6481     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6482                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6483         return NULL;
6484
6485     substring = (PyUnicodeObject *)PyUnicode_FromObject(
6486         (PyObject *)substring);
6487     if (substring == NULL)
6488         return NULL;
6489
6490     FIX_START_END(self);
6491
6492     result = PyInt_FromSsize_t(
6493         stringlib_count(self->str + start, end - start,
6494                         substring->str, substring->length)
6495         );
6496
6497     Py_DECREF(substring);
6498
6499     return result;
6500 }
6501
6502 PyDoc_STRVAR(encode__doc__,
6503 "S.encode([encoding[,errors]]) -> string or unicode\n\
6504 \n\
6505 Encodes S using the codec registered for encoding. encoding defaults\n\
6506 to the default encoding. errors may be given to set a different error\n\
6507 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6508 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6509 'xmlcharrefreplace' as well as any other name registered with\n\
6510 codecs.register_error that can handle UnicodeEncodeErrors.");
6511
6512 static PyObject *
6513 unicode_encode(PyUnicodeObject *self, PyObject *args)
6514 {
6515     char *encoding = NULL;
6516     char *errors = NULL;
6517     PyObject *v;
6518
6519     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6520         return NULL;
6521     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6522     if (v == NULL)
6523         goto onError;
6524     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6525         PyErr_Format(PyExc_TypeError,
6526                      "encoder did not return a string/unicode object "
6527                      "(type=%.400s)",
6528                      Py_TYPE(v)->tp_name);
6529         Py_DECREF(v);
6530         return NULL;
6531     }
6532     return v;
6533
6534  onError:
6535     return NULL;
6536 }
6537
6538 PyDoc_STRVAR(decode__doc__,
6539 "S.decode([encoding[,errors]]) -> string or unicode\n\
6540 \n\
6541 Decodes S using the codec registered for encoding. encoding defaults\n\
6542 to the default encoding. errors may be given to set a different error\n\
6543 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6544 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6545 as well as any other name registerd with codecs.register_error that is\n\
6546 able to handle UnicodeDecodeErrors.");
6547
6548 static PyObject *
6549 unicode_decode(PyUnicodeObject *self, PyObject *args)
6550 {
6551     char *encoding = NULL;
6552     char *errors = NULL;
6553     PyObject *v;
6554
6555     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6556         return NULL;
6557     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6558     if (v == NULL)
6559         goto onError;
6560     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6561         PyErr_Format(PyExc_TypeError,
6562                      "decoder did not return a string/unicode object "
6563                      "(type=%.400s)",
6564                      Py_TYPE(v)->tp_name);
6565         Py_DECREF(v);
6566         return NULL;
6567     }
6568     return v;
6569
6570  onError:
6571     return NULL;
6572 }
6573
6574 PyDoc_STRVAR(expandtabs__doc__,
6575 "S.expandtabs([tabsize]) -> unicode\n\
6576 \n\
6577 Return a copy of S where all tab characters are expanded using spaces.\n\
6578 If tabsize is not given, a tab size of 8 characters is assumed.");
6579
6580 static PyObject*
6581 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6582 {
6583     Py_UNICODE *e;
6584     Py_UNICODE *p;
6585     Py_UNICODE *q;
6586     Py_UNICODE *qe;
6587     Py_ssize_t i, j, incr;
6588     PyUnicodeObject *u;
6589     int tabsize = 8;
6590
6591     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6592         return NULL;
6593
6594     /* First pass: determine size of output string */
6595     i = 0; /* chars up to and including most recent \n or \r */
6596     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6597     e = self->str + self->length; /* end of input */
6598     for (p = self->str; p < e; p++)
6599         if (*p == '\t') {
6600             if (tabsize > 0) {
6601                 incr = tabsize - (j % tabsize); /* cannot overflow */
6602                 if (j > PY_SSIZE_T_MAX - incr)
6603                     goto overflow1;
6604                 j += incr;
6605             }
6606         }
6607         else {
6608             if (j > PY_SSIZE_T_MAX - 1)
6609                 goto overflow1;
6610             j++;
6611             if (*p == '\n' || *p == '\r') {
6612                 if (i > PY_SSIZE_T_MAX - j)
6613                     goto overflow1;
6614                 i += j;
6615                 j = 0;
6616             }
6617         }
6618
6619     if (i > PY_SSIZE_T_MAX - j)
6620         goto overflow1;
6621
6622     /* Second pass: create output string and fill it */
6623     u = _PyUnicode_New(i + j);
6624     if (!u)
6625         return NULL;
6626
6627     j = 0; /* same as in first pass */
6628     q = u->str; /* next output char */
6629     qe = u->str + u->length; /* end of output */
6630
6631     for (p = self->str; p < e; p++)
6632         if (*p == '\t') {
6633             if (tabsize > 0) {
6634                 i = tabsize - (j % tabsize);
6635                 j += i;
6636                 while (i--) {
6637                     if (q >= qe)
6638                         goto overflow2;
6639                     *q++ = ' ';
6640                 }
6641             }
6642         }
6643         else {
6644             if (q >= qe)
6645                 goto overflow2;
6646             *q++ = *p;
6647             j++;
6648             if (*p == '\n' || *p == '\r')
6649                 j = 0;
6650         }
6651
6652     return (PyObject*) u;
6653
6654   overflow2:
6655     Py_DECREF(u);
6656   overflow1:
6657     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6658     return NULL;
6659 }
6660
6661 PyDoc_STRVAR(find__doc__,
6662 "S.find(sub [,start [,end]]) -> int\n\
6663 \n\
6664 Return the lowest index in S where substring sub is found,\n\
6665 such that sub is contained within s[start:end].  Optional\n\
6666 arguments start and end are interpreted as in slice notation.\n\
6667 \n\
6668 Return -1 on failure.");
6669
6670 static PyObject *
6671 unicode_find(PyUnicodeObject *self, PyObject *args)
6672 {
6673     PyObject *substring;
6674     Py_ssize_t start;
6675     Py_ssize_t end;
6676     Py_ssize_t result;
6677
6678     if (!_ParseTupleFinds(args, &substring, &start, &end))
6679         return NULL;
6680
6681     result = stringlib_find_slice(
6682         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6683         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6684         start, end
6685         );
6686
6687     Py_DECREF(substring);
6688
6689     return PyInt_FromSsize_t(result);
6690 }
6691
6692 static PyObject *
6693 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6694 {
6695     if (index < 0 || index >= self->length) {
6696         PyErr_SetString(PyExc_IndexError, "string index out of range");
6697         return NULL;
6698     }
6699
6700     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6701 }
6702
6703 static long
6704 unicode_hash(PyUnicodeObject *self)
6705 {
6706     /* Since Unicode objects compare equal to their ASCII string
6707        counterparts, they should use the individual character values
6708        as basis for their hash value.  This is needed to assure that
6709        strings and Unicode objects behave in the same way as
6710        dictionary keys. */
6711
6712     register Py_ssize_t len;
6713     register Py_UNICODE *p;
6714     register long x;
6715
6716     if (self->hash != -1)
6717         return self->hash;
6718     len = PyUnicode_GET_SIZE(self);
6719     p = PyUnicode_AS_UNICODE(self);
6720     x = *p << 7;
6721     while (--len >= 0)
6722         x = (1000003*x) ^ *p++;
6723     x ^= PyUnicode_GET_SIZE(self);
6724     if (x == -1)
6725         x = -2;
6726     self->hash = x;
6727     return x;
6728 }
6729
6730 PyDoc_STRVAR(index__doc__,
6731 "S.index(sub [,start [,end]]) -> int\n\
6732 \n\
6733 Like S.find() but raise ValueError when the substring is not found.");
6734
6735 static PyObject *
6736 unicode_index(PyUnicodeObject *self, PyObject *args)
6737 {
6738     Py_ssize_t result;
6739     PyObject *substring;
6740     Py_ssize_t start;
6741     Py_ssize_t end;
6742
6743     if (!_ParseTupleFinds(args, &substring, &start, &end))
6744         return NULL;
6745
6746     result = stringlib_find_slice(
6747         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6748         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6749         start, end
6750         );
6751
6752     Py_DECREF(substring);
6753
6754     if (result < 0) {
6755         PyErr_SetString(PyExc_ValueError, "substring not found");
6756         return NULL;
6757     }
6758
6759     return PyInt_FromSsize_t(result);
6760 }
6761
6762 PyDoc_STRVAR(islower__doc__,
6763 "S.islower() -> bool\n\
6764 \n\
6765 Return True if all cased characters in S are lowercase and there is\n\
6766 at least one cased character in S, False otherwise.");
6767
6768 static PyObject*
6769 unicode_islower(PyUnicodeObject *self)
6770 {
6771     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6772     register const Py_UNICODE *e;
6773     int cased;
6774
6775     /* Shortcut for single character strings */
6776     if (PyUnicode_GET_SIZE(self) == 1)
6777         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6778
6779     /* Special case for empty strings */
6780     if (PyUnicode_GET_SIZE(self) == 0)
6781         return PyBool_FromLong(0);
6782
6783     e = p + PyUnicode_GET_SIZE(self);
6784     cased = 0;
6785     for (; p < e; p++) {
6786         register const Py_UNICODE ch = *p;
6787
6788         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6789             return PyBool_FromLong(0);
6790         else if (!cased && Py_UNICODE_ISLOWER(ch))
6791             cased = 1;
6792     }
6793     return PyBool_FromLong(cased);
6794 }
6795
6796 PyDoc_STRVAR(isupper__doc__,
6797 "S.isupper() -> bool\n\
6798 \n\
6799 Return True if all cased characters in S are uppercase and there is\n\
6800 at least one cased character in S, False otherwise.");
6801
6802 static PyObject*
6803 unicode_isupper(PyUnicodeObject *self)
6804 {
6805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806     register const Py_UNICODE *e;
6807     int cased;
6808
6809     /* Shortcut for single character strings */
6810     if (PyUnicode_GET_SIZE(self) == 1)
6811         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6812
6813     /* Special case for empty strings */
6814     if (PyUnicode_GET_SIZE(self) == 0)
6815         return PyBool_FromLong(0);
6816
6817     e = p + PyUnicode_GET_SIZE(self);
6818     cased = 0;
6819     for (; p < e; p++) {
6820         register const Py_UNICODE ch = *p;
6821
6822         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6823             return PyBool_FromLong(0);
6824         else if (!cased && Py_UNICODE_ISUPPER(ch))
6825             cased = 1;
6826     }
6827     return PyBool_FromLong(cased);
6828 }
6829
6830 PyDoc_STRVAR(istitle__doc__,
6831 "S.istitle() -> bool\n\
6832 \n\
6833 Return True if S is a titlecased string and there is at least one\n\
6834 character in S, i.e. upper- and titlecase characters may only\n\
6835 follow uncased characters and lowercase characters only cased ones.\n\
6836 Return False otherwise.");
6837
6838 static PyObject*
6839 unicode_istitle(PyUnicodeObject *self)
6840 {
6841     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6842     register const Py_UNICODE *e;
6843     int cased, previous_is_cased;
6844
6845     /* Shortcut for single character strings */
6846     if (PyUnicode_GET_SIZE(self) == 1)
6847         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6848                                (Py_UNICODE_ISUPPER(*p) != 0));
6849
6850     /* Special case for empty strings */
6851     if (PyUnicode_GET_SIZE(self) == 0)
6852         return PyBool_FromLong(0);
6853
6854     e = p + PyUnicode_GET_SIZE(self);
6855     cased = 0;
6856     previous_is_cased = 0;
6857     for (; p < e; p++) {
6858         register const Py_UNICODE ch = *p;
6859
6860         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6861             if (previous_is_cased)
6862                 return PyBool_FromLong(0);
6863             previous_is_cased = 1;
6864             cased = 1;
6865         }
6866         else if (Py_UNICODE_ISLOWER(ch)) {
6867             if (!previous_is_cased)
6868                 return PyBool_FromLong(0);
6869             previous_is_cased = 1;
6870             cased = 1;
6871         }
6872         else
6873             previous_is_cased = 0;
6874     }
6875     return PyBool_FromLong(cased);
6876 }
6877
6878 PyDoc_STRVAR(isspace__doc__,
6879 "S.isspace() -> bool\n\
6880 \n\
6881 Return True if all characters in S are whitespace\n\
6882 and there is at least one character in S, False otherwise.");
6883
6884 static PyObject*
6885 unicode_isspace(PyUnicodeObject *self)
6886 {
6887     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6888     register const Py_UNICODE *e;
6889
6890     /* Shortcut for single character strings */
6891     if (PyUnicode_GET_SIZE(self) == 1 &&
6892         Py_UNICODE_ISSPACE(*p))
6893         return PyBool_FromLong(1);
6894
6895     /* Special case for empty strings */
6896     if (PyUnicode_GET_SIZE(self) == 0)
6897         return PyBool_FromLong(0);
6898
6899     e = p + PyUnicode_GET_SIZE(self);
6900     for (; p < e; p++) {
6901         if (!Py_UNICODE_ISSPACE(*p))
6902             return PyBool_FromLong(0);
6903     }
6904     return PyBool_FromLong(1);
6905 }
6906
6907 PyDoc_STRVAR(isalpha__doc__,
6908 "S.isalpha() -> bool\n\
6909 \n\
6910 Return True if all characters in S are alphabetic\n\
6911 and there is at least one character in S, False otherwise.");
6912
6913 static PyObject*
6914 unicode_isalpha(PyUnicodeObject *self)
6915 {
6916     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6917     register const Py_UNICODE *e;
6918
6919     /* Shortcut for single character strings */
6920     if (PyUnicode_GET_SIZE(self) == 1 &&
6921         Py_UNICODE_ISALPHA(*p))
6922         return PyBool_FromLong(1);
6923
6924     /* Special case for empty strings */
6925     if (PyUnicode_GET_SIZE(self) == 0)
6926         return PyBool_FromLong(0);
6927
6928     e = p + PyUnicode_GET_SIZE(self);
6929     for (; p < e; p++) {
6930         if (!Py_UNICODE_ISALPHA(*p))
6931             return PyBool_FromLong(0);
6932     }
6933     return PyBool_FromLong(1);
6934 }
6935
6936 PyDoc_STRVAR(isalnum__doc__,
6937 "S.isalnum() -> bool\n\
6938 \n\
6939 Return True if all characters in S are alphanumeric\n\
6940 and there is at least one character in S, False otherwise.");
6941
6942 static PyObject*
6943 unicode_isalnum(PyUnicodeObject *self)
6944 {
6945     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6946     register const Py_UNICODE *e;
6947
6948     /* Shortcut for single character strings */
6949     if (PyUnicode_GET_SIZE(self) == 1 &&
6950         Py_UNICODE_ISALNUM(*p))
6951         return PyBool_FromLong(1);
6952
6953     /* Special case for empty strings */
6954     if (PyUnicode_GET_SIZE(self) == 0)
6955         return PyBool_FromLong(0);
6956
6957     e = p + PyUnicode_GET_SIZE(self);
6958     for (; p < e; p++) {
6959         if (!Py_UNICODE_ISALNUM(*p))
6960             return PyBool_FromLong(0);
6961     }
6962     return PyBool_FromLong(1);
6963 }
6964
6965 PyDoc_STRVAR(isdecimal__doc__,
6966 "S.isdecimal() -> bool\n\
6967 \n\
6968 Return True if there are only decimal characters in S,\n\
6969 False otherwise.");
6970
6971 static PyObject*
6972 unicode_isdecimal(PyUnicodeObject *self)
6973 {
6974     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6975     register const Py_UNICODE *e;
6976
6977     /* Shortcut for single character strings */
6978     if (PyUnicode_GET_SIZE(self) == 1 &&
6979         Py_UNICODE_ISDECIMAL(*p))
6980         return PyBool_FromLong(1);
6981
6982     /* Special case for empty strings */
6983     if (PyUnicode_GET_SIZE(self) == 0)
6984         return PyBool_FromLong(0);
6985
6986     e = p + PyUnicode_GET_SIZE(self);
6987     for (; p < e; p++) {
6988         if (!Py_UNICODE_ISDECIMAL(*p))
6989             return PyBool_FromLong(0);
6990     }
6991     return PyBool_FromLong(1);
6992 }
6993
6994 PyDoc_STRVAR(isdigit__doc__,
6995 "S.isdigit() -> bool\n\
6996 \n\
6997 Return True if all characters in S are digits\n\
6998 and there is at least one character in S, False otherwise.");
6999
7000 static PyObject*
7001 unicode_isdigit(PyUnicodeObject *self)
7002 {
7003     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7004     register const Py_UNICODE *e;
7005
7006     /* Shortcut for single character strings */
7007     if (PyUnicode_GET_SIZE(self) == 1 &&
7008         Py_UNICODE_ISDIGIT(*p))
7009         return PyBool_FromLong(1);
7010
7011     /* Special case for empty strings */
7012     if (PyUnicode_GET_SIZE(self) == 0)
7013         return PyBool_FromLong(0);
7014
7015     e = p + PyUnicode_GET_SIZE(self);
7016     for (; p < e; p++) {
7017         if (!Py_UNICODE_ISDIGIT(*p))
7018             return PyBool_FromLong(0);
7019     }
7020     return PyBool_FromLong(1);
7021 }
7022
7023 PyDoc_STRVAR(isnumeric__doc__,
7024 "S.isnumeric() -> bool\n\
7025 \n\
7026 Return True if there are only numeric characters in S,\n\
7027 False otherwise.");
7028
7029 static PyObject*
7030 unicode_isnumeric(PyUnicodeObject *self)
7031 {
7032     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7033     register const Py_UNICODE *e;
7034
7035     /* Shortcut for single character strings */
7036     if (PyUnicode_GET_SIZE(self) == 1 &&
7037         Py_UNICODE_ISNUMERIC(*p))
7038         return PyBool_FromLong(1);
7039
7040     /* Special case for empty strings */
7041     if (PyUnicode_GET_SIZE(self) == 0)
7042         return PyBool_FromLong(0);
7043
7044     e = p + PyUnicode_GET_SIZE(self);
7045     for (; p < e; p++) {
7046         if (!Py_UNICODE_ISNUMERIC(*p))
7047             return PyBool_FromLong(0);
7048     }
7049     return PyBool_FromLong(1);
7050 }
7051
7052 PyDoc_STRVAR(join__doc__,
7053 "S.join(sequence) -> unicode\n\
7054 \n\
7055 Return a string which is the concatenation of the strings in the\n\
7056 sequence.  The separator between elements is S.");
7057
7058 static PyObject*
7059 unicode_join(PyObject *self, PyObject *data)
7060 {
7061     return PyUnicode_Join(self, data);
7062 }
7063
7064 static Py_ssize_t
7065 unicode_length(PyUnicodeObject *self)
7066 {
7067     return self->length;
7068 }
7069
7070 PyDoc_STRVAR(ljust__doc__,
7071 "S.ljust(width[, fillchar]) -> int\n\
7072 \n\
7073 Return S left justified in a Unicode string of length width. Padding is\n\
7074 done using the specified fill character (default is a space).");
7075
7076 static PyObject *
7077 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7078 {
7079     Py_ssize_t width;
7080     Py_UNICODE fillchar = ' ';
7081
7082     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7083         return NULL;
7084
7085     if (self->length >= width && PyUnicode_CheckExact(self)) {
7086         Py_INCREF(self);
7087         return (PyObject*) self;
7088     }
7089
7090     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7091 }
7092
7093 PyDoc_STRVAR(lower__doc__,
7094 "S.lower() -> unicode\n\
7095 \n\
7096 Return a copy of the string S converted to lowercase.");
7097
7098 static PyObject*
7099 unicode_lower(PyUnicodeObject *self)
7100 {
7101     return fixup(self, fixlower);
7102 }
7103
/* Strip modes.  They select which end(s) of the string are stripped and
   double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is a PyArg_ParseTuple format: one optional object argument,
   followed by the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the leading "|O:" (3 characters) of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7112
7113 /* externally visible for str.strip(unicode) */
7114 PyObject *
7115 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7116 {
7117         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7118         Py_ssize_t len = PyUnicode_GET_SIZE(self);
7119         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7120         Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7121         Py_ssize_t i, j;
7122
7123         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7124
7125         i = 0;
7126         if (striptype != RIGHTSTRIP) {
7127             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7128                 i++;
7129             }
7130         }
7131
7132         j = len;
7133         if (striptype != LEFTSTRIP) {
7134             do {
7135                 j--;
7136             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7137             j++;
7138         }
7139
7140         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7141             Py_INCREF(self);
7142             return (PyObject*)self;
7143         }
7144         else
7145             return PyUnicode_FromUnicode(s+i, j-i);
7146 }
7147
7148
7149 static PyObject *
7150 do_strip(PyUnicodeObject *self, int striptype)
7151 {
7152         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7153         Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7154
7155         i = 0;
7156         if (striptype != RIGHTSTRIP) {
7157                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7158                         i++;
7159                 }
7160         }
7161
7162         j = len;
7163         if (striptype != LEFTSTRIP) {
7164                 do {
7165                         j--;
7166                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7167                 j++;
7168         }
7169
7170         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7171                 Py_INCREF(self);
7172                 return (PyObject*)self;
7173         }
7174         else
7175                 return PyUnicode_FromUnicode(s+i, j-i);
7176 }
7177
7178
7179 static PyObject *
7180 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7181 {
7182         PyObject *sep = NULL;
7183
7184         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7185                 return NULL;
7186
7187         if (sep != NULL && sep != Py_None) {
7188                 if (PyUnicode_Check(sep))
7189                         return _PyUnicode_XStrip(self, striptype, sep);
7190                 else if (PyString_Check(sep)) {
7191                         PyObject *res;
7192                         sep = PyUnicode_FromObject(sep);
7193                         if (sep==NULL)
7194                                 return NULL;
7195                         res = _PyUnicode_XStrip(self, striptype, sep);
7196                         Py_DECREF(sep);
7197                         return res;
7198                 }
7199                 else {
7200                         PyErr_Format(PyExc_TypeError,
7201                                      "%s arg must be None, unicode or str",
7202                                      STRIPNAME(striptype));
7203                         return NULL;
7204                 }
7205         }
7206
7207         return do_strip(self, striptype);
7208 }
7209
7210
7211 PyDoc_STRVAR(strip__doc__,
7212 "S.strip([chars]) -> unicode\n\
7213 \n\
7214 Return a copy of the string S with leading and trailing\n\
7215 whitespace removed.\n\
7216 If chars is given and not None, remove characters in chars instead.\n\
7217 If chars is a str, it will be converted to unicode before stripping");
7218
7219 static PyObject *
7220 unicode_strip(PyUnicodeObject *self, PyObject *args)
7221 {
7222         if (PyTuple_GET_SIZE(args) == 0)
7223                 return do_strip(self, BOTHSTRIP); /* Common case */
7224         else
7225                 return do_argstrip(self, BOTHSTRIP, args);
7226 }
7227
7228
7229 PyDoc_STRVAR(lstrip__doc__,
7230 "S.lstrip([chars]) -> unicode\n\
7231 \n\
7232 Return a copy of the string S with leading whitespace removed.\n\
7233 If chars is given and not None, remove characters in chars instead.\n\
7234 If chars is a str, it will be converted to unicode before stripping");
7235
7236 static PyObject *
7237 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7238 {
7239         if (PyTuple_GET_SIZE(args) == 0)
7240                 return do_strip(self, LEFTSTRIP); /* Common case */
7241         else
7242                 return do_argstrip(self, LEFTSTRIP, args);
7243 }
7244
7245
7246 PyDoc_STRVAR(rstrip__doc__,
7247 "S.rstrip([chars]) -> unicode\n\
7248 \n\
7249 Return a copy of the string S with trailing whitespace removed.\n\
7250 If chars is given and not None, remove characters in chars instead.\n\
7251 If chars is a str, it will be converted to unicode before stripping");
7252
7253 static PyObject *
7254 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7255 {
7256         if (PyTuple_GET_SIZE(args) == 0)
7257                 return do_strip(self, RIGHTSTRIP); /* Common case */
7258         else
7259                 return do_argstrip(self, RIGHTSTRIP, args);
7260 }
7261
7262
7263 static PyObject*
7264 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7265 {
7266     PyUnicodeObject *u;
7267     Py_UNICODE *p;
7268     Py_ssize_t nchars;
7269     size_t nbytes;
7270
7271     if (len < 0)
7272         len = 0;
7273
7274     if (len == 1 && PyUnicode_CheckExact(str)) {
7275         /* no repeat, return original string */
7276         Py_INCREF(str);
7277         return (PyObject*) str;
7278     }
7279
7280     /* ensure # of chars needed doesn't overflow int and # of bytes
7281      * needed doesn't overflow size_t
7282      */
7283     nchars = len * str->length;
7284     if (len && nchars / len != str->length) {
7285         PyErr_SetString(PyExc_OverflowError,
7286                         "repeated string is too long");
7287         return NULL;
7288     }
7289     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7290     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7291         PyErr_SetString(PyExc_OverflowError,
7292                         "repeated string is too long");
7293         return NULL;
7294     }
7295     u = _PyUnicode_New(nchars);
7296     if (!u)
7297         return NULL;
7298
7299     p = u->str;
7300
7301     if (str->length == 1 && len > 0) {
7302         Py_UNICODE_FILL(p, str->str[0], len);
7303     } else {
7304         Py_ssize_t done = 0; /* number of characters copied this far */
7305         if (done < nchars) {
7306             Py_UNICODE_COPY(p, str->str, str->length);
7307             done = str->length;
7308         }
7309         while (done < nchars) {
7310             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7311             Py_UNICODE_COPY(p+done, p, n);
7312             done += n;
7313         }
7314     }
7315
7316     return (PyObject*) u;
7317 }
7318
7319 PyObject *PyUnicode_Replace(PyObject *obj,
7320                             PyObject *subobj,
7321                             PyObject *replobj,
7322                             Py_ssize_t maxcount)
7323 {
7324     PyObject *self;
7325     PyObject *str1;
7326     PyObject *str2;
7327     PyObject *result;
7328
7329     self = PyUnicode_FromObject(obj);
7330     if (self == NULL)
7331         return NULL;
7332     str1 = PyUnicode_FromObject(subobj);
7333     if (str1 == NULL) {
7334         Py_DECREF(self);
7335         return NULL;
7336     }
7337     str2 = PyUnicode_FromObject(replobj);
7338     if (str2 == NULL) {
7339         Py_DECREF(self);
7340         Py_DECREF(str1);
7341         return NULL;
7342     }
7343     result = replace((PyUnicodeObject *)self,
7344                      (PyUnicodeObject *)str1,
7345                      (PyUnicodeObject *)str2,
7346                      maxcount);
7347     Py_DECREF(self);
7348     Py_DECREF(str1);
7349     Py_DECREF(str2);
7350     return result;
7351 }
7352
7353 PyDoc_STRVAR(replace__doc__,
7354 "S.replace (old, new[, count]) -> unicode\n\
7355 \n\
7356 Return a copy of S with all occurrences of substring\n\
7357 old replaced by new.  If the optional argument count is\n\
7358 given, only the first count occurrences are replaced.");
7359
7360 static PyObject*
7361 unicode_replace(PyUnicodeObject *self, PyObject *args)
7362 {
7363     PyUnicodeObject *str1;
7364     PyUnicodeObject *str2;
7365     Py_ssize_t maxcount = -1;
7366     PyObject *result;
7367
7368     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7369         return NULL;
7370     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7371     if (str1 == NULL)
7372         return NULL;
7373     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7374     if (str2 == NULL) {
7375         Py_DECREF(str1);
7376         return NULL;
7377     }
7378
7379     result = replace(self, str1, str2, maxcount);
7380
7381     Py_DECREF(str1);
7382     Py_DECREF(str2);
7383     return result;
7384 }
7385
7386 static
7387 PyObject *unicode_repr(PyObject *unicode)
7388 {
7389     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7390                                 PyUnicode_GET_SIZE(unicode),
7391                                 1);
7392 }
7393
7394 PyDoc_STRVAR(rfind__doc__,
7395 "S.rfind(sub [,start [,end]]) -> int\n\
7396 \n\
7397 Return the highest index in S where substring sub is found,\n\
7398 such that sub is contained within s[start:end].  Optional\n\
7399 arguments start and end are interpreted as in slice notation.\n\
7400 \n\
7401 Return -1 on failure.");
7402
7403 static PyObject *
7404 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7405 {
7406     PyObject *substring;
7407     Py_ssize_t start;
7408     Py_ssize_t end;
7409     Py_ssize_t result;
7410
7411     if (!_ParseTupleFinds(args, &substring, &start, &end))
7412             return NULL;
7413
7414     result = stringlib_rfind_slice(
7415         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7416         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7417         start, end
7418         );
7419
7420     Py_DECREF(substring);
7421
7422     return PyInt_FromSsize_t(result);
7423 }
7424
7425 PyDoc_STRVAR(rindex__doc__,
7426 "S.rindex(sub [,start [,end]]) -> int\n\
7427 \n\
7428 Like S.rfind() but raise ValueError when the substring is not found.");
7429
7430 static PyObject *
7431 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7432 {
7433     PyObject *substring;
7434     Py_ssize_t start;
7435     Py_ssize_t end;
7436     Py_ssize_t result;
7437
7438     if (!_ParseTupleFinds(args, &substring, &start, &end))
7439             return NULL;
7440
7441     result = stringlib_rfind_slice(
7442         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7443         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7444         start, end
7445         );
7446
7447     Py_DECREF(substring);
7448
7449     if (result < 0) {
7450         PyErr_SetString(PyExc_ValueError, "substring not found");
7451         return NULL;
7452     }
7453     return PyInt_FromSsize_t(result);
7454 }
7455
7456 PyDoc_STRVAR(rjust__doc__,
7457 "S.rjust(width[, fillchar]) -> unicode\n\
7458 \n\
7459 Return S right justified in a Unicode string of length width. Padding is\n\
7460 done using the specified fill character (default is a space).");
7461
7462 static PyObject *
7463 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7464 {
7465     Py_ssize_t width;
7466     Py_UNICODE fillchar = ' ';
7467
7468     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7469         return NULL;
7470
7471     if (self->length >= width && PyUnicode_CheckExact(self)) {
7472         Py_INCREF(self);
7473         return (PyObject*) self;
7474     }
7475
7476     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7477 }
7478
7479 static PyObject*
7480 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7481 {
7482     /* standard clamping */
7483     if (start < 0)
7484         start = 0;
7485     if (end < 0)
7486         end = 0;
7487     if (end > self->length)
7488         end = self->length;
7489     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7490         /* full slice, return original string */
7491         Py_INCREF(self);
7492         return (PyObject*) self;
7493     }
7494     if (start > end)
7495         start = end;
7496     /* copy slice */
7497     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7498                                              end - start);
7499 }
7500
7501 PyObject *PyUnicode_Split(PyObject *s,
7502                           PyObject *sep,
7503                           Py_ssize_t maxsplit)
7504 {
7505     PyObject *result;
7506
7507     s = PyUnicode_FromObject(s);
7508     if (s == NULL)
7509         return NULL;
7510     if (sep != NULL) {
7511         sep = PyUnicode_FromObject(sep);
7512         if (sep == NULL) {
7513             Py_DECREF(s);
7514             return NULL;
7515         }
7516     }
7517
7518     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7519
7520     Py_DECREF(s);
7521     Py_XDECREF(sep);
7522     return result;
7523 }
7524
7525 PyDoc_STRVAR(split__doc__,
7526 "S.split([sep [,maxsplit]]) -> list of strings\n\
7527 \n\
7528 Return a list of the words in S, using sep as the\n\
7529 delimiter string.  If maxsplit is given, at most maxsplit\n\
7530 splits are done. If sep is not specified or is None, any\n\
7531 whitespace string is a separator and empty strings are\n\
7532 removed from the result.");
7533
7534 static PyObject*
7535 unicode_split(PyUnicodeObject *self, PyObject *args)
7536 {
7537     PyObject *substring = Py_None;
7538     Py_ssize_t maxcount = -1;
7539
7540     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7541         return NULL;
7542
7543     if (substring == Py_None)
7544         return split(self, NULL, maxcount);
7545     else if (PyUnicode_Check(substring))
7546         return split(self, (PyUnicodeObject *)substring, maxcount);
7547     else
7548         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7549 }
7550
7551 PyObject *
7552 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7553 {
7554     PyObject* str_obj;
7555     PyObject* sep_obj;
7556     PyObject* out;
7557
7558     str_obj = PyUnicode_FromObject(str_in);
7559     if (!str_obj)
7560         return NULL;
7561     sep_obj = PyUnicode_FromObject(sep_in);
7562     if (!sep_obj) {
7563         Py_DECREF(str_obj);
7564         return NULL;
7565     }
7566
7567     out = stringlib_partition(
7568         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7569         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7570         );
7571
7572     Py_DECREF(sep_obj);
7573     Py_DECREF(str_obj);
7574
7575     return out;
7576 }
7577
7578
7579 PyObject *
7580 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7581 {
7582     PyObject* str_obj;
7583     PyObject* sep_obj;
7584     PyObject* out;
7585
7586     str_obj = PyUnicode_FromObject(str_in);
7587     if (!str_obj)
7588         return NULL;
7589     sep_obj = PyUnicode_FromObject(sep_in);
7590     if (!sep_obj) {
7591         Py_DECREF(str_obj);
7592         return NULL;
7593     }
7594
7595     out = stringlib_rpartition(
7596         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7597         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7598         );
7599
7600     Py_DECREF(sep_obj);
7601     Py_DECREF(str_obj);
7602
7603     return out;
7604 }
7605
7606 PyDoc_STRVAR(partition__doc__,
7607 "S.partition(sep) -> (head, sep, tail)\n\
7608 \n\
7609 Searches for the separator sep in S, and returns the part before it,\n\
7610 the separator itself, and the part after it.  If the separator is not\n\
7611 found, returns S and two empty strings.");
7612
7613 static PyObject*
7614 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7615 {
7616     return PyUnicode_Partition((PyObject *)self, separator);
7617 }
7618
7619 PyDoc_STRVAR(rpartition__doc__,
7620 "S.rpartition(sep) -> (tail, sep, head)\n\
7621 \n\
7622 Searches for the separator sep in S, starting at the end of S, and returns\n\
7623 the part before it, the separator itself, and the part after it.  If the\n\
7624 separator is not found, returns two empty strings and S.");
7625
7626 static PyObject*
7627 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7628 {
7629     return PyUnicode_RPartition((PyObject *)self, separator);
7630 }
7631
7632 PyObject *PyUnicode_RSplit(PyObject *s,
7633                            PyObject *sep,
7634                            Py_ssize_t maxsplit)
7635 {
7636     PyObject *result;
7637
7638     s = PyUnicode_FromObject(s);
7639     if (s == NULL)
7640         return NULL;
7641     if (sep != NULL) {
7642         sep = PyUnicode_FromObject(sep);
7643         if (sep == NULL) {
7644             Py_DECREF(s);
7645             return NULL;
7646         }
7647     }
7648
7649     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7650
7651     Py_DECREF(s);
7652     Py_XDECREF(sep);
7653     return result;
7654 }
7655
7656 PyDoc_STRVAR(rsplit__doc__,
7657 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7658 \n\
7659 Return a list of the words in S, using sep as the\n\
7660 delimiter string, starting at the end of the string and\n\
7661 working to the front.  If maxsplit is given, at most maxsplit\n\
7662 splits are done. If sep is not specified, any whitespace string\n\
7663 is a separator.");
7664
7665 static PyObject*
7666 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7667 {
7668     PyObject *substring = Py_None;
7669     Py_ssize_t maxcount = -1;
7670
7671     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7672         return NULL;
7673
7674     if (substring == Py_None)
7675         return rsplit(self, NULL, maxcount);
7676     else if (PyUnicode_Check(substring))
7677         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7678     else
7679         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7680 }
7681
7682 PyDoc_STRVAR(splitlines__doc__,
7683 "S.splitlines([keepends]]) -> list of strings\n\
7684 \n\
7685 Return a list of the lines in S, breaking at line boundaries.\n\
7686 Line breaks are not included in the resulting list unless keepends\n\
7687 is given and true.");
7688
7689 static PyObject*
7690 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7691 {
7692     int keepends = 0;
7693
7694     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7695         return NULL;
7696
7697     return PyUnicode_Splitlines((PyObject *)self, keepends);
7698 }
7699
7700 static
7701 PyObject *unicode_str(PyUnicodeObject *self)
7702 {
7703     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7704 }
7705
7706 PyDoc_STRVAR(swapcase__doc__,
7707 "S.swapcase() -> unicode\n\
7708 \n\
7709 Return a copy of S with uppercase characters converted to lowercase\n\
7710 and vice versa.");
7711
7712 static PyObject*
7713 unicode_swapcase(PyUnicodeObject *self)
7714 {
7715     return fixup(self, fixswapcase);
7716 }
7717
7718 PyDoc_STRVAR(translate__doc__,
7719 "S.translate(table) -> unicode\n\
7720 \n\
7721 Return a copy of the string S, where all characters have been mapped\n\
7722 through the given translation table, which must be a mapping of\n\
7723 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7724 Unmapped characters are left untouched. Characters mapped to None\n\
7725 are deleted.");
7726
7727 static PyObject*
7728 unicode_translate(PyUnicodeObject *self, PyObject *table)
7729 {
7730     return PyUnicode_TranslateCharmap(self->str,
7731                                       self->length,
7732                                       table,
7733                                       "ignore");
7734 }
7735
7736 PyDoc_STRVAR(upper__doc__,
7737 "S.upper() -> unicode\n\
7738 \n\
7739 Return a copy of S converted to uppercase.");
7740
7741 static PyObject*
7742 unicode_upper(PyUnicodeObject *self)
7743 {
7744     return fixup(self, fixupper);
7745 }
7746
7747 PyDoc_STRVAR(zfill__doc__,
7748 "S.zfill(width) -> unicode\n\
7749 \n\
7750 Pad a numeric string x with zeros on the left, to fill a field\n\
7751 of the specified width. The string x is never truncated.");
7752
7753 static PyObject *
7754 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7755 {
7756     Py_ssize_t fill;
7757     PyUnicodeObject *u;
7758
7759     Py_ssize_t width;
7760     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7761         return NULL;
7762
7763     if (self->length >= width) {
7764         if (PyUnicode_CheckExact(self)) {
7765             Py_INCREF(self);
7766             return (PyObject*) self;
7767         }
7768         else
7769             return PyUnicode_FromUnicode(
7770                 PyUnicode_AS_UNICODE(self),
7771                 PyUnicode_GET_SIZE(self)
7772             );
7773     }
7774
7775     fill = width - self->length;
7776
7777     u = pad(self, fill, 0, '0');
7778
7779     if (u == NULL)
7780         return NULL;
7781
7782     if (u->str[fill] == '+' || u->str[fill] == '-') {
7783         /* move sign to beginning of string */
7784         u->str[0] = u->str[fill];
7785         u->str[fill] = '0';
7786     }
7787
7788     return (PyObject*) u;
7789 }
7790
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7798
7799 PyDoc_STRVAR(startswith__doc__,
7800 "S.startswith(prefix[, start[, end]]) -> bool\n\
7801 \n\
7802 Return True if S starts with the specified prefix, False otherwise.\n\
7803 With optional start, test S beginning at that position.\n\
7804 With optional end, stop comparing S at that position.\n\
7805 prefix can also be a tuple of strings to try.");
7806
7807 static PyObject *
7808 unicode_startswith(PyUnicodeObject *self,
7809                    PyObject *args)
7810 {
7811     PyObject *subobj;
7812     PyUnicodeObject *substring;
7813     Py_ssize_t start = 0;
7814     Py_ssize_t end = PY_SSIZE_T_MAX;
7815     int result;
7816
7817     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7818                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7819         return NULL;
7820     if (PyTuple_Check(subobj)) {
7821         Py_ssize_t i;
7822         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7823             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7824                             PyTuple_GET_ITEM(subobj, i));
7825             if (substring == NULL)
7826                 return NULL;
7827             result = tailmatch(self, substring, start, end, -1);
7828             Py_DECREF(substring);
7829             if (result) {
7830                 Py_RETURN_TRUE;
7831             }
7832         }
7833         /* nothing matched */
7834         Py_RETURN_FALSE;
7835     }
7836     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7837     if (substring == NULL)
7838          return NULL;
7839     result = tailmatch(self, substring, start, end, -1);
7840     Py_DECREF(substring);
7841     return PyBool_FromLong(result);
7842 }
7843
7844
7845 PyDoc_STRVAR(endswith__doc__,
7846 "S.endswith(suffix[, start[, end]]) -> bool\n\
7847 \n\
7848 Return True if S ends with the specified suffix, False otherwise.\n\
7849 With optional start, test S beginning at that position.\n\
7850 With optional end, stop comparing S at that position.\n\
7851 suffix can also be a tuple of strings to try.");
7852
7853 static PyObject *
7854 unicode_endswith(PyUnicodeObject *self,
7855                  PyObject *args)
7856 {
7857     PyObject *subobj;
7858     PyUnicodeObject *substring;
7859     Py_ssize_t start = 0;
7860     Py_ssize_t end = PY_SSIZE_T_MAX;
7861     int result;
7862
7863     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7864         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7865         return NULL;
7866     if (PyTuple_Check(subobj)) {
7867         Py_ssize_t i;
7868         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7869             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7870                             PyTuple_GET_ITEM(subobj, i));
7871             if (substring == NULL)
7872             return NULL;
7873             result = tailmatch(self, substring, start, end, +1);
7874             Py_DECREF(substring);
7875             if (result) {
7876                 Py_RETURN_TRUE;
7877             }
7878         }
7879         Py_RETURN_FALSE;
7880     }
7881     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7882     if (substring == NULL)
7883     return NULL;
7884
7885     result = tailmatch(self, substring, start, end, +1);
7886     Py_DECREF(substring);
7887     return PyBool_FromLong(result);
7888 }
7889
7890
7891 /* Implements do_string_format, which is unicode because of stringlib */
7892 #include "stringlib/string_format.h"
7893
7894 PyDoc_STRVAR(format__doc__,
7895 "S.format(*args, **kwargs) -> unicode\n\
7896 \n\
7897 ");
7898
7899 static PyObject *
7900 unicode__format__(PyObject *self, PyObject *args)
7901 {
7902     PyObject *format_spec;
7903     PyObject *result = NULL;
7904     PyObject *tmp = NULL;
7905
7906     /* If 2.x, convert format_spec to the same type as value */
7907     /* This is to allow things like u''.format('') */
7908     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7909         goto done;
7910     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7911         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7912                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7913         goto done;
7914     }
7915     tmp = PyObject_Unicode(format_spec);
7916     if (tmp == NULL)
7917         goto done;
7918     format_spec = tmp;
7919
7920     result = _PyUnicode_FormatAdvanced(self,
7921                                        PyUnicode_AS_UNICODE(format_spec),
7922                                        PyUnicode_GET_SIZE(format_spec));
7923 done:
7924     Py_XDECREF(tmp);
7925     return result;
7926 }
7927
7928 PyDoc_STRVAR(p_format__doc__,
7929 "S.__format__(format_spec) -> unicode\n\
7930 \n\
7931 ");
7932
7933 static PyObject *
7934 unicode__sizeof__(PyUnicodeObject *v)
7935 {
7936     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7937                              sizeof(Py_UNICODE) * (v->length + 1));
7938 }
7939
7940 PyDoc_STRVAR(sizeof__doc__,
7941 "S.__sizeof__() -> size of S in memory, in bytes\n\
7942 \n\
7943 ");
7944
7945 static PyObject *
7946 unicode_getnewargs(PyUnicodeObject *v)
7947 {
7948         return Py_BuildValue("(u#)", v->str, v->length);
7949 }
7950
7951
/* Method table of the unicode type.  Each entry lists the Python-visible
   method name, the C function implementing it, the calling-convention
   flags, and (where given) the docstring.  The table is terminated by
   the {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format() is the only entry that also accepts keyword arguments */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries are registered without a docstring */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8014
8015 static PyObject *
8016 unicode_mod(PyObject *v, PyObject *w)
8017 {
8018        if (!PyUnicode_Check(v)) {
8019                Py_INCREF(Py_NotImplemented);
8020                return Py_NotImplemented;
8021        }
8022        return PyUnicode_Format(v, w);
8023 }
8024
/* Number protocol table for unicode.  Only nb_remainder is filled in
   (with unicode_mod); the initializer stops there, so every later slot
   is implicitly zero-initialised. */
static PyNumberMethods unicode_as_number = {
        0,                              /*nb_add*/
        0,                              /*nb_subtract*/
        0,                              /*nb_multiply*/
        0,                              /*nb_divide*/
        unicode_mod,                    /*nb_remainder*/
};
8032
/* Sequence protocol table for unicode.  The two assignment slots are 0;
   all other listed slots point at implementations in this file. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
8043
8044 static PyObject*
8045 unicode_subscript(PyUnicodeObject* self, PyObject* item)
8046 {
8047     if (PyIndex_Check(item)) {
8048         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8049         if (i == -1 && PyErr_Occurred())
8050             return NULL;
8051         if (i < 0)
8052             i += PyUnicode_GET_SIZE(self);
8053         return unicode_getitem(self, i);
8054     } else if (PySlice_Check(item)) {
8055         Py_ssize_t start, stop, step, slicelength, cur, i;
8056         Py_UNICODE* source_buf;
8057         Py_UNICODE* result_buf;
8058         PyObject* result;
8059
8060         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8061                                  &start, &stop, &step, &slicelength) < 0) {
8062             return NULL;
8063         }
8064
8065         if (slicelength <= 0) {
8066             return PyUnicode_FromUnicode(NULL, 0);
8067         } else if (start == 0 && step == 1 && slicelength == self->length &&
8068                    PyUnicode_CheckExact(self)) {
8069             Py_INCREF(self);
8070             return (PyObject *)self;
8071         } else if (step == 1) {
8072             return PyUnicode_FromUnicode(self->str + start, slicelength);
8073         } else {
8074             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8075             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8076                                                        sizeof(Py_UNICODE));
8077
8078             if (result_buf == NULL)
8079                     return PyErr_NoMemory();
8080
8081             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8082                 result_buf[i] = source_buf[cur];
8083             }
8084
8085             result = PyUnicode_FromUnicode(result_buf, slicelength);
8086             PyObject_FREE(result_buf);
8087             return result;
8088         }
8089     } else {
8090         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8091         return NULL;
8092     }
8093 }
8094
/* Mapping protocol table for unicode: length and subscript lookup are
   provided, subscript assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
8100
8101 static Py_ssize_t
8102 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8103                           Py_ssize_t index,
8104                           const void **ptr)
8105 {
8106     if (index != 0) {
8107         PyErr_SetString(PyExc_SystemError,
8108                         "accessing non-existent unicode segment");
8109         return -1;
8110     }
8111     *ptr = (void *) self->str;
8112     return PyUnicode_GET_DATA_SIZE(self);
8113 }
8114
8115 static Py_ssize_t
8116 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8117                            const void **ptr)
8118 {
8119     PyErr_SetString(PyExc_TypeError,
8120                     "cannot use unicode as modifiable buffer");
8121     return -1;
8122 }
8123
8124 static int
8125 unicode_buffer_getsegcount(PyUnicodeObject *self,
8126                            Py_ssize_t *lenp)
8127 {
8128     if (lenp)
8129         *lenp = PyUnicode_GET_DATA_SIZE(self);
8130     return 1;
8131 }
8132
8133 static Py_ssize_t
8134 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8135                           Py_ssize_t index,
8136                           const void **ptr)
8137 {
8138     PyObject *str;
8139
8140     if (index != 0) {
8141         PyErr_SetString(PyExc_SystemError,
8142                         "accessing non-existent unicode segment");
8143         return -1;
8144     }
8145     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8146     if (str == NULL)
8147         return -1;
8148     *ptr = (void *) PyString_AS_STRING(str);
8149     return PyString_GET_SIZE(str);
8150 }
8151
8152 /* Helpers for PyUnicode_Format() */
8153
8154 static PyObject *
8155 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8156 {
8157     Py_ssize_t argidx = *p_argidx;
8158     if (argidx < arglen) {
8159         (*p_argidx)++;
8160         if (arglen < 0)
8161             return args;
8162         else
8163             return PyTuple_GetItem(args, argidx);
8164     }
8165     PyErr_SetString(PyExc_TypeError,
8166                     "not enough arguments for format string");
8167     return NULL;
8168 }
8169
/* Conversion flag bits collected by PyUnicode_Format() while it parses
   a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8175
8176 static Py_ssize_t
8177 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8178 {
8179     register Py_ssize_t i;
8180     Py_ssize_t len = strlen(charbuffer);
8181     for (i = len - 1; i >= 0; i--)
8182         buffer[i] = (Py_UNICODE) charbuffer[i];
8183
8184     return len;
8185 }
8186
8187 static int
8188 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8189 {
8190     Py_ssize_t result;
8191
8192     PyOS_ascii_formatd((char *)buffer, len, format, x);
8193     result = strtounicode(buffer, (char *)buffer);
8194     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8195 }
8196
8197 static int
8198 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8199 {
8200     Py_ssize_t result;
8201
8202     PyOS_snprintf((char *)buffer, len, format, x);
8203     result = strtounicode(buffer, (char *)buffer);
8204     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8205 }
8206
8207 /* XXX To save some code duplication, formatfloat/long/int could have been
8208    shared with stringobject.c, converting from 8-bit to Unicode after the
8209    formatting is done. */
8210
8211 static int
8212 formatfloat(Py_UNICODE *buf,
8213             size_t buflen,
8214             int flags,
8215             int prec,
8216             int type,
8217             PyObject *v)
8218 {
8219     /* fmt = '%#.' + `prec` + `type`
8220        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8221     char fmt[20];
8222     double x;
8223
8224     x = PyFloat_AsDouble(v);
8225     if (x == -1.0 && PyErr_Occurred())
8226         return -1;
8227     if (prec < 0)
8228         prec = 6;
8229     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8230         type = 'g';
8231     /* Worst case length calc to ensure no buffer overrun:
8232
8233        'g' formats:
8234          fmt = %#.<prec>g
8235          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8236             for any double rep.)
8237          len = 1 + prec + 1 + 2 + 5 = 9 + prec
8238
8239        'f' formats:
8240          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8241          len = 1 + 50 + 1 + prec = 52 + prec
8242
8243        If prec=0 the effective precision is 1 (the leading digit is
8244        always given), therefore increase the length by one.
8245
8246     */
8247     if (((type == 'g' || type == 'G') &&
8248           buflen <= (size_t)10 + (size_t)prec) ||
8249         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8250         PyErr_SetString(PyExc_OverflowError,
8251                         "formatted float is too long (precision too large?)");
8252         return -1;
8253     }
8254     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8255                   (flags&F_ALT) ? "#" : "",
8256                   prec, type);
8257     return doubletounicode(buf, buflen, fmt, x);
8258 }
8259
8260 static PyObject*
8261 formatlong(PyObject *val, int flags, int prec, int type)
8262 {
8263         char *buf;
8264         int i, len;
8265         PyObject *str; /* temporary string object. */
8266         PyUnicodeObject *result;
8267
8268         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8269         if (!str)
8270                 return NULL;
8271         result = _PyUnicode_New(len);
8272         if (!result) {
8273                 Py_DECREF(str);
8274                 return NULL;
8275         }
8276         for (i = 0; i < len; i++)
8277                 result->str[i] = buf[i];
8278         result->str[len] = 0;
8279         Py_DECREF(str);
8280         return (PyObject*)result;
8281 }
8282
8283 static int
8284 formatint(Py_UNICODE *buf,
8285           size_t buflen,
8286           int flags,
8287           int prec,
8288           int type,
8289           PyObject *v)
8290 {
8291     /* fmt = '%#.' + `prec` + 'l' + `type`
8292      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8293      *                     + 1 + 1
8294      *                   = 24
8295      */
8296     char fmt[64]; /* plenty big enough! */
8297     char *sign;
8298     long x;
8299
8300     x = PyInt_AsLong(v);
8301     if (x == -1 && PyErr_Occurred())
8302         return -1;
8303     if (x < 0 && type == 'u') {
8304         type = 'd';
8305     }
8306     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8307         sign = "-";
8308     else
8309         sign = "";
8310     if (prec < 0)
8311         prec = 1;
8312
8313     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8314      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8315      */
8316     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8317         PyErr_SetString(PyExc_OverflowError,
8318                 "formatted integer is too long (precision too large?)");
8319         return -1;
8320     }
8321
8322     if ((flags & F_ALT) &&
8323         (type == 'x' || type == 'X')) {
8324         /* When converting under %#x or %#X, there are a number
8325          * of issues that cause pain:
8326          * - when 0 is being converted, the C standard leaves off
8327          *   the '0x' or '0X', which is inconsistent with other
8328          *   %#x/%#X conversions and inconsistent with Python's
8329          *   hex() function
8330          * - there are platforms that violate the standard and
8331          *   convert 0 with the '0x' or '0X'
8332          *   (Metrowerks, Compaq Tru64)
8333          * - there are platforms that give '0x' when converting
8334          *   under %#X, but convert 0 in accordance with the
8335          *   standard (OS/2 EMX)
8336          *
8337          * We can achieve the desired consistency by inserting our
8338          * own '0x' or '0X' prefix, and substituting %x/%X in place
8339          * of %#x/%#X.
8340          *
8341          * Note that this is the same approach as used in
8342          * formatint() in stringobject.c
8343          */
8344         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8345                       sign, type, prec, type);
8346     }
8347     else {
8348         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8349                       sign, (flags&F_ALT) ? "#" : "",
8350                       prec, type);
8351     }
8352     if (sign[0])
8353         return longtounicode(buf, buflen, fmt, -x);
8354     else
8355         return longtounicode(buf, buflen, fmt, x);
8356 }
8357
8358 static int
8359 formatchar(Py_UNICODE *buf,
8360            size_t buflen,
8361            PyObject *v)
8362 {
8363     /* presume that the buffer is at least 2 characters long */
8364     if (PyUnicode_Check(v)) {
8365         if (PyUnicode_GET_SIZE(v) != 1)
8366             goto onError;
8367         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8368     }
8369
8370     else if (PyString_Check(v)) {
8371         if (PyString_GET_SIZE(v) != 1)
8372             goto onError;
8373         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8374     }
8375
8376     else {
8377         /* Integer input truncated to a character */
8378         long x;
8379         x = PyInt_AsLong(v);
8380         if (x == -1 && PyErr_Occurred())
8381             goto onError;
8382 #ifdef Py_UNICODE_WIDE
8383         if (x < 0 || x > 0x10ffff) {
8384             PyErr_SetString(PyExc_OverflowError,
8385                             "%c arg not in range(0x110000) "
8386                             "(wide Python build)");
8387             return -1;
8388         }
8389 #else
8390         if (x < 0 || x > 0xffff) {
8391             PyErr_SetString(PyExc_OverflowError,
8392                             "%c arg not in range(0x10000) "
8393                             "(narrow Python build)");
8394             return -1;
8395         }
8396 #endif
8397         buf[0] = (Py_UNICODE) x;
8398     }
8399     buf[1] = '\0';
8400     return 1;
8401
8402  onError:
8403     PyErr_SetString(PyExc_TypeError,
8404                     "%c requires int or char");
8405     return -1;
8406 }
8407
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE units, of the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
8417
8418 PyObject *PyUnicode_Format(PyObject *format,
8419                            PyObject *args)
8420 {
8421     Py_UNICODE *fmt, *res;
8422     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8423     int args_owned = 0;
8424     PyUnicodeObject *result = NULL;
8425     PyObject *dict = NULL;
8426     PyObject *uformat;
8427
8428     if (format == NULL || args == NULL) {
8429         PyErr_BadInternalCall();
8430         return NULL;
8431     }
8432     uformat = PyUnicode_FromObject(format);
8433     if (uformat == NULL)
8434         return NULL;
8435     fmt = PyUnicode_AS_UNICODE(uformat);
8436     fmtcnt = PyUnicode_GET_SIZE(uformat);
8437
8438     reslen = rescnt = fmtcnt + 100;
8439     result = _PyUnicode_New(reslen);
8440     if (result == NULL)
8441         goto onError;
8442     res = PyUnicode_AS_UNICODE(result);
8443
8444     if (PyTuple_Check(args)) {
8445         arglen = PyTuple_Size(args);
8446         argidx = 0;
8447     }
8448     else {
8449         arglen = -1;
8450         argidx = -2;
8451     }
8452     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8453         !PyObject_TypeCheck(args, &PyBaseString_Type))
8454         dict = args;
8455
8456     while (--fmtcnt >= 0) {
8457         if (*fmt != '%') {
8458             if (--rescnt < 0) {
8459                 rescnt = fmtcnt + 100;
8460                 reslen += rescnt;
8461                 if (_PyUnicode_Resize(&result, reslen) < 0)
8462                     goto onError;
8463                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8464                 --rescnt;
8465             }
8466             *res++ = *fmt++;
8467         }
8468         else {
8469             /* Got a format specifier */
8470             int flags = 0;
8471             Py_ssize_t width = -1;
8472             int prec = -1;
8473             Py_UNICODE c = '\0';
8474             Py_UNICODE fill;
8475             int isnumok;
8476             PyObject *v = NULL;
8477             PyObject *temp = NULL;
8478             Py_UNICODE *pbuf;
8479             Py_UNICODE sign;
8480             Py_ssize_t len;
8481             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8482
8483             fmt++;
8484             if (*fmt == '(') {
8485                 Py_UNICODE *keystart;
8486                 Py_ssize_t keylen;
8487                 PyObject *key;
8488                 int pcount = 1;
8489
8490                 if (dict == NULL) {
8491                     PyErr_SetString(PyExc_TypeError,
8492                                     "format requires a mapping");
8493                     goto onError;
8494                 }
8495                 ++fmt;
8496                 --fmtcnt;
8497                 keystart = fmt;
8498                 /* Skip over balanced parentheses */
8499                 while (pcount > 0 && --fmtcnt >= 0) {
8500                     if (*fmt == ')')
8501                         --pcount;
8502                     else if (*fmt == '(')
8503                         ++pcount;
8504                     fmt++;
8505                 }
8506                 keylen = fmt - keystart - 1;
8507                 if (fmtcnt < 0 || pcount > 0) {
8508                     PyErr_SetString(PyExc_ValueError,
8509                                     "incomplete format key");
8510                     goto onError;
8511                 }
8512 #if 0
8513                 /* keys are converted to strings using UTF-8 and
8514                    then looked up since Python uses strings to hold
8515                    variables names etc. in its namespaces and we
8516                    wouldn't want to break common idioms. */
8517                 key = PyUnicode_EncodeUTF8(keystart,
8518                                            keylen,
8519                                            NULL);
8520 #else
8521                 key = PyUnicode_FromUnicode(keystart, keylen);
8522 #endif
8523                 if (key == NULL)
8524                     goto onError;
8525                 if (args_owned) {
8526                     Py_DECREF(args);
8527                     args_owned = 0;
8528                 }
8529                 args = PyObject_GetItem(dict, key);
8530                 Py_DECREF(key);
8531                 if (args == NULL) {
8532                     goto onError;
8533                 }
8534                 args_owned = 1;
8535                 arglen = -1;
8536                 argidx = -2;
8537             }
8538             while (--fmtcnt >= 0) {
8539                 switch (c = *fmt++) {
8540                 case '-': flags |= F_LJUST; continue;
8541                 case '+': flags |= F_SIGN; continue;
8542                 case ' ': flags |= F_BLANK; continue;
8543                 case '#': flags |= F_ALT; continue;
8544                 case '0': flags |= F_ZERO; continue;
8545                 }
8546                 break;
8547             }
8548             if (c == '*') {
8549                 v = getnextarg(args, arglen, &argidx);
8550                 if (v == NULL)
8551                     goto onError;
8552                 if (!PyInt_Check(v)) {
8553                     PyErr_SetString(PyExc_TypeError,
8554                                     "* wants int");
8555                     goto onError;
8556                 }
8557                 width = PyInt_AsLong(v);
8558                 if (width < 0) {
8559                     flags |= F_LJUST;
8560                     width = -width;
8561                 }
8562                 if (--fmtcnt >= 0)
8563                     c = *fmt++;
8564             }
8565             else if (c >= '0' && c <= '9') {
8566                 width = c - '0';
8567                 while (--fmtcnt >= 0) {
8568                     c = *fmt++;
8569                     if (c < '0' || c > '9')
8570                         break;
8571                     if ((width*10) / 10 != width) {
8572                         PyErr_SetString(PyExc_ValueError,
8573                                         "width too big");
8574                         goto onError;
8575                     }
8576                     width = width*10 + (c - '0');
8577                 }
8578             }
8579             if (c == '.') {
8580                 prec = 0;
8581                 if (--fmtcnt >= 0)
8582                     c = *fmt++;
8583                 if (c == '*') {
8584                     v = getnextarg(args, arglen, &argidx);
8585                     if (v == NULL)
8586                         goto onError;
8587                     if (!PyInt_Check(v)) {
8588                         PyErr_SetString(PyExc_TypeError,
8589                                         "* wants int");
8590                         goto onError;
8591                     }
8592                     prec = PyInt_AsLong(v);
8593                     if (prec < 0)
8594                         prec = 0;
8595                     if (--fmtcnt >= 0)
8596                         c = *fmt++;
8597                 }
8598                 else if (c >= '0' && c <= '9') {
8599                     prec = c - '0';
8600                     while (--fmtcnt >= 0) {
8601                         c = Py_CHARMASK(*fmt++);
8602                         if (c < '0' || c > '9')
8603                             break;
8604                         if ((prec*10) / 10 != prec) {
8605                             PyErr_SetString(PyExc_ValueError,
8606                                             "prec too big");
8607                             goto onError;
8608                         }
8609                         prec = prec*10 + (c - '0');
8610                     }
8611                 }
8612             } /* prec */
8613             if (fmtcnt >= 0) {
8614                 if (c == 'h' || c == 'l' || c == 'L') {
8615                     if (--fmtcnt >= 0)
8616                         c = *fmt++;
8617                 }
8618             }
8619             if (fmtcnt < 0) {
8620                 PyErr_SetString(PyExc_ValueError,
8621                                 "incomplete format");
8622                 goto onError;
8623             }
8624             if (c != '%') {
8625                 v = getnextarg(args, arglen, &argidx);
8626                 if (v == NULL)
8627                     goto onError;
8628             }
8629             sign = 0;
8630             fill = ' ';
8631             switch (c) {
8632
8633             case '%':
8634                 pbuf = formatbuf;
8635                 /* presume that buffer length is at least 1 */
8636                 pbuf[0] = '%';
8637                 len = 1;
8638                 break;
8639
8640             case 's':
8641             case 'r':
8642                 if (PyUnicode_Check(v) && c == 's') {
8643                     temp = v;
8644                     Py_INCREF(temp);
8645                 }
8646                 else {
8647                     PyObject *unicode;
8648                     if (c == 's')
8649                         temp = PyObject_Unicode(v);
8650                     else
8651                         temp = PyObject_Repr(v);
8652                     if (temp == NULL)
8653                         goto onError;
8654                     if (PyUnicode_Check(temp))
8655                         /* nothing to do */;
8656                     else if (PyString_Check(temp)) {
8657                         /* convert to string to Unicode */
8658                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8659                                                    PyString_GET_SIZE(temp),
8660                                                    NULL,
8661                                                    "strict");
8662                         Py_DECREF(temp);
8663                         temp = unicode;
8664                         if (temp == NULL)
8665                             goto onError;
8666                     }
8667                     else {
8668                         Py_DECREF(temp);
8669                         PyErr_SetString(PyExc_TypeError,
8670                                         "%s argument has non-string str()");
8671                         goto onError;
8672                     }
8673                 }
8674                 pbuf = PyUnicode_AS_UNICODE(temp);
8675                 len = PyUnicode_GET_SIZE(temp);
8676                 if (prec >= 0 && len > prec)
8677                     len = prec;
8678                 break;
8679
8680             case 'i':
8681             case 'd':
8682             case 'u':
8683             case 'o':
8684             case 'x':
8685             case 'X':
8686                 if (c == 'i')
8687                     c = 'd';
8688                 isnumok = 0;
8689                 if (PyNumber_Check(v)) {
8690                         PyObject *iobj=NULL;
8691
8692                         if (PyInt_Check(v) || (PyLong_Check(v))) {
8693                                 iobj = v;
8694                                 Py_INCREF(iobj);
8695                         }
8696                         else {
8697                                 iobj = PyNumber_Int(v);
8698                                 if (iobj==NULL) iobj = PyNumber_Long(v);
8699                         }
8700                         if (iobj!=NULL) {
8701                                 if (PyInt_Check(iobj)) {
8702                                         isnumok = 1;
8703                                         pbuf = formatbuf;
8704                                         len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8705                                                     flags, prec, c, iobj);
8706                                         Py_DECREF(iobj);
8707                                         if (len < 0)
8708                                             goto onError;
8709                                         sign = 1;
8710                                 }
8711                                 else if (PyLong_Check(iobj)) {
8712                                         isnumok = 1;
8713                                         temp = formatlong(iobj, flags, prec, c);
8714                                         Py_DECREF(iobj);
8715                                         if (!temp)
8716                                             goto onError;
8717                                         pbuf = PyUnicode_AS_UNICODE(temp);
8718                                         len = PyUnicode_GET_SIZE(temp);
8719                                         sign = 1;
8720                                 }
8721                                 else {
8722                                         Py_DECREF(iobj);
8723                                 }
8724                         }
8725                 }
8726                 if (!isnumok) {
8727                         PyErr_Format(PyExc_TypeError,
8728                             "%%%c format: a number is required, "
8729                                      "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8730                         goto onError;
8731                 }
8732                 if (flags & F_ZERO)
8733                     fill = '0';
8734                 break;
8735
8736             case 'e':
8737             case 'E':
8738             case 'f':
8739             case 'F':
8740             case 'g':
8741             case 'G':
8742                 if (c == 'F')
8743                         c = 'f';
8744                 pbuf = formatbuf;
8745                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8746                         flags, prec, c, v);
8747                 if (len < 0)
8748                     goto onError;
8749                 sign = 1;
8750                 if (flags & F_ZERO)
8751                     fill = '0';
8752                 break;
8753
8754             case 'c':
8755                 pbuf = formatbuf;
8756                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8757                 if (len < 0)
8758                     goto onError;
8759                 break;
8760
8761             default:
8762                 PyErr_Format(PyExc_ValueError,
8763                              "unsupported format character '%c' (0x%x) "
8764                              "at index %zd",
8765                              (31<=c && c<=126) ? (char)c : '?',
8766                              (int)c,
8767                              (Py_ssize_t)(fmt - 1 -
8768                                           PyUnicode_AS_UNICODE(uformat)));
8769                 goto onError;
8770             }
8771             if (sign) {
8772                 if (*pbuf == '-' || *pbuf == '+') {
8773                     sign = *pbuf++;
8774                     len--;
8775                 }
8776                 else if (flags & F_SIGN)
8777                     sign = '+';
8778                 else if (flags & F_BLANK)
8779                     sign = ' ';
8780                 else
8781                     sign = 0;
8782             }
8783             if (width < len)
8784                 width = len;
8785             if (rescnt - (sign != 0) < width) {
8786                 reslen -= rescnt;
8787                 rescnt = width + fmtcnt + 100;
8788                 reslen += rescnt;
8789                 if (reslen < 0) {
8790                     Py_XDECREF(temp);
8791                     PyErr_NoMemory();
8792                     goto onError;
8793                 }
8794                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8795                     Py_XDECREF(temp);
8796                     goto onError;
8797                 }
8798                 res = PyUnicode_AS_UNICODE(result)
8799                     + reslen - rescnt;
8800             }
8801             if (sign) {
8802                 if (fill != ' ')
8803                     *res++ = sign;
8804                 rescnt--;
8805                 if (width > len)
8806                     width--;
8807             }
8808             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8809                 assert(pbuf[0] == '0');
8810                 assert(pbuf[1] == c);
8811                 if (fill != ' ') {
8812                     *res++ = *pbuf++;
8813                     *res++ = *pbuf++;
8814                 }
8815                 rescnt -= 2;
8816                 width -= 2;
8817                 if (width < 0)
8818                     width = 0;
8819                 len -= 2;
8820             }
8821             if (width > len && !(flags & F_LJUST)) {
8822                 do {
8823                     --rescnt;
8824                     *res++ = fill;
8825                 } while (--width > len);
8826             }
8827             if (fill == ' ') {
8828                 if (sign)
8829                     *res++ = sign;
8830                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8831                     assert(pbuf[0] == '0');
8832                     assert(pbuf[1] == c);
8833                     *res++ = *pbuf++;
8834                     *res++ = *pbuf++;
8835                 }
8836             }
8837             Py_UNICODE_COPY(res, pbuf, len);
8838             res += len;
8839             rescnt -= len;
8840             while (--width >= len) {
8841                 --rescnt;
8842                 *res++ = ' ';
8843             }
8844             if (dict && (argidx < arglen) && c != '%') {
8845                 PyErr_SetString(PyExc_TypeError,
8846                                 "not all arguments converted during string formatting");
8847                 Py_XDECREF(temp);
8848                 goto onError;
8849             }
8850             Py_XDECREF(temp);
8851         } /* '%' */
8852     } /* until end */
8853     if (argidx < arglen && !dict) {
8854         PyErr_SetString(PyExc_TypeError,
8855                         "not all arguments converted during string formatting");
8856         goto onError;
8857     }
8858
8859     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8860         goto onError;
8861     if (args_owned) {
8862         Py_DECREF(args);
8863     }
8864     Py_DECREF(uformat);
8865     return (PyObject *)result;
8866
8867  onError:
8868     Py_XDECREF(result);
8869     Py_DECREF(uformat);
8870     if (args_owned) {
8871         Py_DECREF(args);
8872     }
8873     return NULL;
8874 }
8875
/* Buffer-protocol slot table for unicode objects, wiring up the four
   unicode_buffer_* functions defined in this file. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8882
8883 static PyObject *
8884 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8885
8886 static PyObject *
8887 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8888 {
8889         PyObject *x = NULL;
8890         static char *kwlist[] = {"string", "encoding", "errors", 0};
8891         char *encoding = NULL;
8892         char *errors = NULL;
8893
8894         if (type != &PyUnicode_Type)
8895                 return unicode_subtype_new(type, args, kwds);
8896         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8897                                           kwlist, &x, &encoding, &errors))
8898             return NULL;
8899         if (x == NULL)
8900                 return (PyObject *)_PyUnicode_New(0);
8901         if (encoding == NULL && errors == NULL)
8902             return PyObject_Unicode(x);
8903         else
8904         return PyUnicode_FromEncodedObject(x, encoding, errors);
8905 }
8906
8907 static PyObject *
8908 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8909 {
8910         PyUnicodeObject *tmp, *pnew;
8911         Py_ssize_t n;
8912
8913         assert(PyType_IsSubtype(type, &PyUnicode_Type));
8914         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8915         if (tmp == NULL)
8916                 return NULL;
8917         assert(PyUnicode_Check(tmp));
8918         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8919         if (pnew == NULL) {
8920                 Py_DECREF(tmp);
8921                 return NULL;
8922         }
8923         pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8924         if (pnew->str == NULL) {
8925                 _Py_ForgetReference((PyObject *)pnew);
8926                 PyObject_Del(pnew);
8927                 Py_DECREF(tmp);
8928                 return PyErr_NoMemory();
8929         }
8930         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8931         pnew->length = n;
8932         pnew->hash = tmp->hash;
8933         Py_DECREF(tmp);
8934         return (PyObject *)pnew;
8935 }
8936
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8943
/* Type object for unicode.  It derives from PyBaseString_Type, may itself
   be subclassed (Py_TPFLAGS_BASETYPE) and connects the number, sequence,
   mapping and buffer slot tables defined in this file. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,               /* tp_free */
};
8987
8988 /* Initialize the Unicode implementation */
8989
8990 void _PyUnicode_Init(void)
8991 {
8992     int i;
8993
8994     /* XXX - move this array to unicodectype.c ? */
8995     Py_UNICODE linebreak[] = {
8996         0x000A, /* LINE FEED */
8997         0x000D, /* CARRIAGE RETURN */
8998         0x001C, /* FILE SEPARATOR */
8999         0x001D, /* GROUP SEPARATOR */
9000         0x001E, /* RECORD SEPARATOR */
9001         0x0085, /* NEXT LINE */
9002         0x2028, /* LINE SEPARATOR */
9003         0x2029, /* PARAGRAPH SEPARATOR */
9004     };
9005
9006     /* Init the implementation */
9007     free_list = NULL;
9008     numfree = 0;
9009     unicode_empty = _PyUnicode_New(0);
9010     if (!unicode_empty)
9011         return;
9012
9013     strcpy(unicode_default_encoding, "ascii");
9014     for (i = 0; i < 256; i++)
9015         unicode_latin1[i] = NULL;
9016     if (PyType_Ready(&PyUnicode_Type) < 0)
9017         Py_FatalError("Can't initialize 'unicode'");
9018
9019     /* initialize the linebreak bloom filter */
9020     bloom_linebreak = make_bloom_mask(
9021         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9022         );
9023
9024     PyType_Ready(&EncodingMapType);
9025 }
9026
9027 /* Finalize the Unicode implementation */
9028
9029 int
9030 PyUnicode_ClearFreeList(void)
9031 {
9032     int freelist_size = numfree;
9033     PyUnicodeObject *u;
9034
9035     for (u = free_list; u != NULL;) {
9036         PyUnicodeObject *v = u;
9037         u = *(PyUnicodeObject **)u;
9038         if (v->str)
9039             PyObject_DEL(v->str);
9040         Py_XDECREF(v->defenc);
9041         PyObject_Del(v);
9042         numfree--;
9043     }
9044     free_list = NULL;
9045     assert(numfree == 0);
9046     return freelist_size;
9047 }
9048
9049 void
9050 _PyUnicode_Fini(void)
9051 {
9052     int i;
9053
9054     Py_XDECREF(unicode_empty);
9055     unicode_empty = NULL;
9056
9057     for (i = 0; i < 256; i++) {
9058         if (unicode_latin1[i]) {
9059             Py_DECREF(unicode_latin1[i]);
9060             unicode_latin1[i] = NULL;
9061         }
9062     }
9063     (void)PyUnicode_ClearFreeList();
9064 }
9065
9066 #ifdef __cplusplus
9067 }
9068 #endif
9069
9070
9071 /*
9072 Local variables:
9073 c-basic-offset: 4
9074 indent-tabs-mode: nil
9075 End:
9076 */